ovs/lib/dpif-netdev.c

/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "dpif-netdev.h"

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <net/if.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>

#ifdef DPDK_NETDEV
#include <rte_cycles.h>
#endif

#include "bitmap.h"
#include "cmap.h"
#include "conntrack.h"
#include "coverage.h"
#include "ct-dpif.h"
#include "csum.h"
#include "dp-packet.h"
#include "dpif.h"
#include "dpif-provider.h"
#include "dummy.h"
#include "fat-rwlock.h"
#include "flow.h"
#include "hmapx.h"
#include "latch.h"
#include "netdev.h"
#include "netdev-vport.h"
#include "netlink.h"
#include "odp-execute.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/ofp-util.h"
#include "openvswitch/ofpbuf.h"
#include "openvswitch/shash.h"
#include "openvswitch/vlog.h"
#include "ovs-numa.h"
#include "ovs-rcu.h"
#include "packets.h"
#include "poll-loop.h"
#include "pvector.h"
#include "random.h"
#include "seq.h"
#include "smap.h"
#include "sset.h"
#include "timeval.h"
#include "tnl-neigh-cache.h"
#include "tnl-ports.h"
#include "unixctl.h"
#include "util.h"

VLOG_DEFINE_THIS_MODULE(dpif_netdev);

#define FLOW_DUMP_MAX_BATCH 50
/* Use per thread recirc_depth to prevent recirculation loop. */
#define MAX_RECIRC_DEPTH 5
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)

/* Configuration parameters. */
enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */

/* Protects against changes to 'dp_netdevs'. */
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dp_netdev's. */
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
    = SHASH_INITIALIZER(&dp_netdevs);

static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);

#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED)
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)

static struct odp_support dp_netdev_support = {
    .max_mpls_depth = SIZE_MAX,
    .recirc = true,
    .ct_state = true,
    .ct_zone = true,
    .ct_mark = true,
    .ct_label = true,
};

/* Stores a miniflow with inline values */

struct netdev_flow_key {
    uint32_t hash;       /* Hash function differs for different users. */
    uint32_t len;        /* Length of the following miniflow (incl. map). */
    struct miniflow mf;
    uint64_t buf[FLOW_MAX_PACKET_U64S];
};

/* Exact match cache for frequently used flows
 *
 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
 * search its entries for a miniflow that matches exactly the miniflow of the
 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
 *
 * A cache entry holds a reference to its 'dp_netdev_flow'.
 *
 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
 * value is the index of a cache entry where the miniflow could be.
 *
 *
 * Thread-safety
 * =============
 *
 * Each pmd_thread has its own private exact match cache.
 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
 */

#define EM_FLOW_HASH_SHIFT 13
#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
#define EM_FLOW_HASH_SEGS 2

struct emc_entry {
    struct dp_netdev_flow *flow;
    struct netdev_flow_key key;   /* key.hash used for emc hash value. */
};

struct emc_cache {
    struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
    int sweep_idx;                /* For emc_cache_slow_sweep(). */
};

/* Iterate in the exact match cache through every entry that might contain a
 * miniflow with hash 'HASH'. */
#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH)                 \
    for (uint32_t i__ = 0, srch_hash__ = (HASH);                             \
         (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
         i__ < EM_FLOW_HASH_SEGS;                                            \
         i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)

/* Simple non-wildcarding single-priority classifier. */

/* Time in ms between successive optimizations of the dpcls subtable vector */
#define DPCLS_OPTIMIZATION_INTERVAL 1000

struct dpcls {
    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
    odp_port_t in_port;
    struct cmap subtables_map;
    struct pvector subtables;
};

/* A rule to be inserted to the classifier. */
struct dpcls_rule {
    struct cmap_node cmap_node;   /* Within struct dpcls_subtable 'rules'. */
    struct netdev_flow_key *mask; /* Subtable's mask. */
    struct netdev_flow_key flow;  /* Matching key. */
    /* 'flow' must be the last field, additional space is allocated here. */
};

static void dpcls_init(struct dpcls *);
static void dpcls_destroy(struct dpcls *);
static void dpcls_sort_subtable_vector(struct dpcls *);
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
                         const struct netdev_flow_key *mask);
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
static bool dpcls_lookup(struct dpcls *cls,
                         const struct netdev_flow_key keys[],
                         struct dpcls_rule **rules, size_t cnt,
                         int *num_lookups_p);

/* Datapath based on the network device interface from netdev.h.
 *
 *
 * Thread-safety
 * =============
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 *
 * Acquisition order is, from outermost to innermost:
 *
 *    dp_netdev_mutex (global)
 *    port_mutex
 *    non_pmd_mutex
 */
struct dp_netdev {
    const struct dpif_class *const class;
    const char *const name;
    struct dpif *dpif;
    struct ovs_refcount ref_cnt;
    atomic_flag destroyed;

    /* Ports.
     *
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
     * through 'ports' requires taking 'port_mutex'. */
    struct ovs_mutex port_mutex;
    struct hmap ports;
    struct seq *port_seq;       /* Incremented whenever a port changes. */

    /* Protects access to ofproto-dpif-upcall interface during revalidator
     * thread synchronization. */
    struct fat_rwlock upcall_rwlock;
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
    void *upcall_aux;

    /* Callback function for notifying the purging of dp flows (during
     * reseting pmd deletion). */
    dp_purge_callback *dp_purge_cb;
    void *dp_purge_aux;

    /* Stores all 'struct dp_netdev_pmd_thread's. */
    struct cmap poll_threads;

    /* Protects the access of the 'struct dp_netdev_pmd_thread'
     * instance for non-pmd thread. */
    struct ovs_mutex non_pmd_mutex;

    /* Each pmd thread will store its pointer to
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
    ovsthread_key_t per_pmd_key;

    struct seq *reconfigure_seq;
    uint64_t last_reconfigure_seq;

    /* Cpu mask for pin of pmd threads. */
    char *pmd_cmask;

    uint64_t last_tnl_conf_seq;

    struct conntrack conntrack;
};

static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
                                                    odp_port_t)
    OVS_REQUIRES(dp->port_mutex);

enum dp_stat_type {
    DP_STAT_EXACT_HIT,          /* Packets that had an exact match (emc). */
    DP_STAT_MASKED_HIT,         /* Packets that matched in the flow table. */
    DP_STAT_MISS,               /* Packets that did not match. */
    DP_STAT_LOST,               /* Packets not passed up to the client. */
    DP_STAT_LOOKUP_HIT,         /* Number of subtable lookups for flow table
                                   hits */
    DP_N_STATS
};

enum pmd_cycles_counter_type {
    PMD_CYCLES_POLLING,         /* Cycles spent polling NICs. */
    PMD_CYCLES_PROCESSING,      /* Cycles spent processing packets */
    PMD_N_CYCLES
};

#define XPS_TIMEOUT_MS 500LL

/* Contained by struct dp_netdev_port's 'rxqs' member.  */
struct dp_netdev_rxq {
    struct netdev_rxq *rxq;
    unsigned core_id;           /* Сore to which this queue is pinned. */
};

/* A port in a netdev-based datapath. */
struct dp_netdev_port {
    odp_port_t port_no;
    struct netdev *netdev;
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
    struct netdev_saved_flags *sf;
    struct dp_netdev_rxq *rxqs;
    unsigned n_rxq;             /* Number of elements in 'rxq' */
    bool dynamic_txqs;          /* If true XPS will be used. */
    unsigned *txq_used;         /* Number of threads that uses each tx queue. */
    struct ovs_mutex txq_used_mutex;
    char *type;                 /* Port type as requested by user. */
    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
};

/* Contained by struct dp_netdev_flow's 'stats' member.  */
struct dp_netdev_flow_stats {
    atomic_llong used;             /* Last used time, in monotonic msecs. */
    atomic_ullong packet_count;    /* Number of packets matched. */
    atomic_ullong byte_count;      /* Number of bytes matched. */
    atomic_uint16_t tcp_flags;     /* Bitwise-OR of seen tcp_flags values. */
};

/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
 *
 *
 * Thread-safety
 * =============
 *
 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
 * its pmd thread's classifier.  The text below calls this classifier 'cls'.
 *
 * Motivation
 * ----------
 *
 * The thread safety rules described here for "struct dp_netdev_flow" are
 * motivated by two goals:
 *
 *    - Prevent threads that read members of "struct dp_netdev_flow" from
 *      reading bad data due to changes by some thread concurrently modifying
 *      those members.
 *
 *    - Prevent two threads making changes to members of a given "struct
 *      dp_netdev_flow" from interfering with each other.
 *
 *
 * Rules
 * -----
 *
 * A flow 'flow' may be accessed without a risk of being freed during an RCU
 * grace period.  Code that needs to hold onto a flow for a while
 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
 *
 * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 * from modification.
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 */
struct dp_netdev_flow {
    const struct flow flow;      /* Unmasked flow that created this entry. */
    /* Hash table index by unmasked flow. */
    const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
                                 /* 'flow_table'. */
    const ovs_u128 ufid;         /* Unique flow identifier. */
    const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */
                                 /* flow. */

    /* Number of references.
     * The classifier owns one reference.
     * Any thread trying to keep a rule from being freed should hold its own
     * reference. */
    struct ovs_refcount ref_cnt;

    bool dead;

    /* Statistics. */
    struct dp_netdev_flow_stats stats;

    /* Actions. */
    OVSRCU_TYPE(struct dp_netdev_actions *) actions;

    /* While processing a group of input packets, the datapath uses the next
     * member to store a pointer to the output batch for the flow.  It is
     * reset after the batch has been sent out (See dp_netdev_queue_batches(),
     * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
    struct packet_batch_per_flow *batch;

    /* Packet classification. */
    struct dpcls_rule cr;        /* In owning dp_netdev's 'cls'. */
    /* 'cr' must be the last member. */
};

static void dp_netdev_flow_unref(struct dp_netdev_flow *);
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
                                         struct flow *);

/* A set of datapath actions within a "struct dp_netdev_flow".
 *
 *
 * Thread-safety
 * =============
 *
 * A struct dp_netdev_actions 'actions' is protected with RCU. */
struct dp_netdev_actions {
    /* These members are immutable: they do not change during the struct's
     * lifetime.  */
    unsigned int size;          /* Size of 'actions', in bytes. */
    struct nlattr actions[];    /* Sequence of OVS_ACTION_ATTR_* attributes. */
};

struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
                                                   size_t);
struct dp_netdev_actions *dp_netdev_flow_get_actions(
    const struct dp_netdev_flow *);
static void dp_netdev_actions_free(struct dp_netdev_actions *);

/* Contained by struct dp_netdev_pmd_thread's 'stats' member.  */
struct dp_netdev_pmd_stats {
    /* Indexed by DP_STAT_*. */
    atomic_ullong n[DP_N_STATS];
};

/* Contained by struct dp_netdev_pmd_thread's 'cycle' member.  */
struct dp_netdev_pmd_cycles {
    /* Indexed by PMD_CYCLES_*. */
    atomic_ullong n[PMD_N_CYCLES];
};

/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
struct rxq_poll {
    struct dp_netdev_port *port;
    struct netdev_rxq *rx;
    struct ovs_list node;
};

/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
 * 'tnl_port_cache' or 'tx_ports'. */
struct tx_port {
    struct dp_netdev_port *port;
    int qid;
    long long last_used;
    struct hmap_node node;
};

/* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
 * the performance overhead of interrupt processing.  Therefore netdev can
 * not implement rx-wait for these devices.  dpif-netdev needs to poll
 * these device to check for recv buffer.  pmd-thread does polling for
 * devices assigned to itself.
 *
 * DPDK used PMD for accessing NIC.
 *
 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
 * I/O of all non-pmd threads.  There will be no actual thread created
 * for the instance.
 *
 * Each struct has its own flow table and classifier.  Packets received
 * from managed ports are looked up in the corresponding pmd thread's
 * flow table, and are executed with the found actions.
 * */
struct dp_netdev_pmd_thread {
    struct dp_netdev *dp;
    struct ovs_refcount ref_cnt;    /* Every reference must be refcount'ed. */
    struct cmap_node node;          /* In 'dp->poll_threads'. */

    pthread_cond_t cond;            /* For synchronizing pmd thread reload. */
    struct ovs_mutex cond_mutex;    /* Mutex for condition variable. */

    /* Per thread exact-match cache.  Note, the instance for cpu core
     * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
     * need to be protected by 'non_pmd_mutex'.  Every other instance
     * will only be accessed by its own pmd thread. */
    struct emc_cache flow_cache;

    /* Flow-Table and classifiers
     *
     * Writers of 'flow_table' must take the 'flow_mutex'.  Corresponding
     * changes to 'classifiers' must be made while still holding the
     * 'flow_mutex'.
     */
    struct ovs_mutex flow_mutex;
    struct cmap flow_table OVS_GUARDED; /* Flow table. */

    /* One classifier per in_port polled by the pmd */
    struct cmap classifiers;
    /* Periodically sort subtable vectors according to hit frequencies */
    long long int next_optimization;

    /* Statistics. */
    struct dp_netdev_pmd_stats stats;

    /* Cycles counters */
    struct dp_netdev_pmd_cycles cycles;

    /* Used to count cicles. See 'cycles_counter_end()' */
    unsigned long long last_cycles;

    struct latch exit_latch;        /* For terminating the pmd thread. */
    atomic_uint change_seq;         /* For reloading pmd ports. */
    pthread_t thread;
    unsigned core_id;               /* CPU core id of this pmd thread. */
    int numa_id;                    /* numa node id of this pmd thread. */
    bool isolated;

    /* Queue id used by this pmd thread to send packets on all netdevs if
     * XPS disabled for this netdev. All static_tx_qid's are unique and less
     * than 'ovs_numa_get_n_cores() + 1'. */
    atomic_int static_tx_qid;

    struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
    /* List of rx queues to poll. */
    struct ovs_list poll_list OVS_GUARDED;
    /* Number of elements in 'poll_list' */
    int poll_cnt;
    /* Map of 'tx_port's used for transmission.  Written by the main thread,
     * read by the pmd thread. */
    struct hmap tx_ports OVS_GUARDED;

    /* These are thread-local copies of 'tx_ports'.  One contains only tunnel
     * ports (that support push_tunnel/pop_tunnel), the other contains ports
     * with at least one txq (that support send).  A port can be in both.
     *
     * There are two separate maps to make sure that we don't try to execute
     * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
     *
     * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
     * threads, and thusly need to be protected by 'non_pmd_mutex'.  Every
     * other instance will only be accessed by its own pmd thread. */
    struct hmap tnl_port_cache;
    struct hmap send_port_cache;

    /* Only a pmd thread can write on its own 'cycles' and 'stats'.
     * The main thread keeps 'stats_zero' and 'cycles_zero' as base
     * values and subtracts them from 'stats' and 'cycles' before
     * reporting to the user */
    unsigned long long stats_zero[DP_N_STATS];
    uint64_t cycles_zero[PMD_N_CYCLES];
};

#define PMD_INITIAL_SEQ 1

/* Interface to netdev-based datapath. */
struct dpif_netdev {
    struct dpif dpif;
    struct dp_netdev *dp;
    uint64_t last_port_seq;
};

static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
                              struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex);
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
                            struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex);
static void dp_netdev_free(struct dp_netdev *)
    OVS_REQUIRES(dp_netdev_mutex);
static int do_add_port(struct dp_netdev *dp, const char *devname,
                       const char *type, odp_port_t port_no)
    OVS_REQUIRES(dp->port_mutex);
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
    OVS_REQUIRES(dp->port_mutex);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
                            bool create, struct dpif **);
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                      struct dp_packet_batch *,
                                      bool may_steal, const struct flow *flow,
                                      const struct nlattr *actions,
                                      size_t actions_len,
                                      long long now);
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
                            struct dp_packet_batch *, odp_port_t port_no);
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                  struct dp_packet_batch *);

static void dp_netdev_disable_upcall(struct dp_netdev *);
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
                                    struct dp_netdev *dp, unsigned core_id,
                                    int numa_id);
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);

static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
                                                      unsigned core_id);
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
    OVS_REQUIRES(dp->port_mutex);
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
                                             struct dp_netdev_port *port);
static void dp_netdev_add_port_to_pmds(struct dp_netdev *dp,
                                       struct dp_netdev_port *port);
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                         struct dp_netdev_port *port);
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                     struct dp_netdev_port *port,
                                     struct netdev_rxq *rx);
static struct dp_netdev_pmd_thread *
dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);
static void reconfigure_pmd_threads(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
    OVS_REQUIRES(pmd->port_mutex);
static inline void
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);

static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                               long long now, bool purge);
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
                                      struct tx_port *tx, long long now);

static inline bool emc_entry_alive(struct emc_entry *ce);
static void emc_clear_entry(struct emc_entry *ce);

static void
emc_cache_init(struct emc_cache *flow_cache)
{
    int i;

    flow_cache->sweep_idx = 0;
    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
        flow_cache->entries[i].flow = NULL;
        flow_cache->entries[i].key.hash = 0;
        flow_cache->entries[i].key.len = sizeof(struct miniflow);
        flowmap_init(&flow_cache->entries[i].key.mf.map);
    }
}

static void
emc_cache_uninit(struct emc_cache *flow_cache)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
        emc_clear_entry(&flow_cache->entries[i]);
    }
}

/* Check and clear dead flow references slowly (one entry at each
 * invocation).  */
static void
emc_cache_slow_sweep(struct emc_cache *flow_cache)
{
    struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];

    if (!emc_entry_alive(entry)) {
        emc_clear_entry(entry);
    }
    flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
}

/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
bool
dpif_is_netdev(const struct dpif *dpif)
{
    return dpif->dpif_class->open == dpif_netdev_open;
}

static struct dpif_netdev *
dpif_netdev_cast(const struct dpif *dpif)
{
    ovs_assert(dpif_is_netdev(dpif));
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
}

static struct dp_netdev *
get_dp_netdev(const struct dpif *dpif)
{
    return dpif_netdev_cast(dpif)->dp;
}

enum pmd_info_type {
    PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
    PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
    PMD_INFO_SHOW_RXQ     /* Show poll-lists of pmd threads. */
};

static void
pmd_info_show_stats(struct ds *reply,
                    struct dp_netdev_pmd_thread *pmd,
                    unsigned long long stats[DP_N_STATS],
                    uint64_t cycles[PMD_N_CYCLES])
{
    unsigned long long total_packets = 0;
    uint64_t total_cycles = 0;
    int i;

    /* These loops subtracts reference values ('*_zero') from the counters.
     * Since loads and stores are relaxed, it might be possible for a '*_zero'
     * value to be more recent than the current value we're reading from the
     * counter.  This is not a big problem, since these numbers are not
     * supposed to be too accurate, but we should at least make sure that
     * the result is not negative. */
    for (i = 0; i < DP_N_STATS; i++) {
        if (stats[i] > pmd->stats_zero[i]) {
            stats[i] -= pmd->stats_zero[i];
        } else {
            stats[i] = 0;
        }

        if (i != DP_STAT_LOST) {
            /* Lost packets are already included in DP_STAT_MISS */
            total_packets += stats[i];
        }
    }

    for (i = 0; i < PMD_N_CYCLES; i++) {
        if (cycles[i] > pmd->cycles_zero[i]) {
           cycles[i] -= pmd->cycles_zero[i];
        } else {
            cycles[i] = 0;
        }

        total_cycles += cycles[i];
    }

    ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
                        ? "main thread" : "pmd thread");

    if (pmd->numa_id != OVS_NUMA_UNSPEC) {
        ds_put_format(reply, " numa_id %d", pmd->numa_id);
    }
    if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
        ds_put_format(reply, " core_id %u", pmd->core_id);
    }
    ds_put_cstr(reply, ":\n");

    ds_put_format(reply,
                  "\temc hits:%llu\n\tmegaflow hits:%llu\n"
                  "\tavg. subtable lookups per hit:%.2f\n"
                  "\tmiss:%llu\n\tlost:%llu\n",
                  stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
                  stats[DP_STAT_MASKED_HIT] > 0
                  ? (1.0*stats[DP_STAT_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT]
                  : 0,
                  stats[DP_STAT_MISS], stats[DP_STAT_LOST]);

    if (total_cycles == 0) {
        return;
    }

    ds_put_format(reply,
                  "\tpolling cycles:%"PRIu64" (%.02f%%)\n"
                  "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
                  cycles[PMD_CYCLES_POLLING],
                  cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100,
                  cycles[PMD_CYCLES_PROCESSING],
                  cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);

    if (total_packets == 0) {
        return;
    }

    ds_put_format(reply,
                  "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
                  total_cycles / (double)total_packets,
                  total_cycles, total_packets);

    ds_put_format(reply,
                  "\tavg processing cycles per packet: "
                  "%.02f (%"PRIu64"/%llu)\n",
                  cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
                  cycles[PMD_CYCLES_PROCESSING], total_packets);
}

static void
pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
                    struct dp_netdev_pmd_thread *pmd,
                    unsigned long long stats[DP_N_STATS],
                    uint64_t cycles[PMD_N_CYCLES])
{
    int i;

    /* We cannot write 'stats' and 'cycles' (because they're written by other
     * threads) and we shouldn't change 'stats' (because they're used to count
     * datapath stats, which must not be cleared here).  Instead, we save the
     * current values and subtract them from the values to be displayed in the
     * future */
    for (i = 0; i < DP_N_STATS; i++) {
        pmd->stats_zero[i] = stats[i];
    }
    for (i = 0; i < PMD_N_CYCLES; i++) {
        pmd->cycles_zero[i] = cycles[i];
    }
}

static void
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
{
    if (pmd->core_id != NON_PMD_CORE_ID) {
        struct rxq_poll *poll;
        const char *prev_name = NULL;

        ds_put_format(reply,
                      "pmd thread numa_id %d core_id %u:\n\tisolated : %s\n",
                      pmd->numa_id, pmd->core_id, (pmd->isolated)
                                                  ? "true" : "false");

        ovs_mutex_lock(&pmd->port_mutex);
        LIST_FOR_EACH (poll, node, &pmd->poll_list) {
            const char *name = netdev_get_name(poll->port->netdev);

            if (!prev_name || strcmp(name, prev_name)) {
                if (prev_name) {
                    ds_put_cstr(reply, "\n");
                }
                ds_put_format(reply, "\tport: %s\tqueue-id:",
                              netdev_get_name(poll->port->netdev));
            }
            ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx));
            prev_name = name;
        }
        ovs_mutex_unlock(&pmd->port_mutex);
        ds_put_cstr(reply, "\n");
    }
}

static void
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
                     void *aux)
{
    struct ds reply = DS_EMPTY_INITIALIZER;
    struct dp_netdev_pmd_thread *pmd;
    struct dp_netdev *dp = NULL;
    enum pmd_info_type type = *(enum pmd_info_type *) aux;

    ovs_mutex_lock(&dp_netdev_mutex);

    if (argc == 2) {
        dp = shash_find_data(&dp_netdevs, argv[1]);
    } else if (shash_count(&dp_netdevs) == 1) {
        /* There's only one datapath */
        dp = shash_first(&dp_netdevs)->data;
    }

    if (!dp) {
        ovs_mutex_unlock(&dp_netdev_mutex);
        unixctl_command_reply_error(conn,
                                    "please specify an existing datapath");
        return;
    }

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        if (type == PMD_INFO_SHOW_RXQ) {
            pmd_info_show_rxq(&reply, pmd);
        } else {
            unsigned long long stats[DP_N_STATS];
            uint64_t cycles[PMD_N_CYCLES];
            int i;

            /* Read current stats and cycle counters */
            for (i = 0; i < ARRAY_SIZE(stats); i++) {
                atomic_read_relaxed(&pmd->stats.n[i], &stats[i]);
            }
            for (i = 0; i < ARRAY_SIZE(cycles); i++) {
                atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]);
            }

            if (type == PMD_INFO_CLEAR_STATS) {
                pmd_info_clear_stats(&reply, pmd, stats, cycles);
            } else if (type == PMD_INFO_SHOW_STATS) {
                pmd_info_show_stats(&reply, pmd, stats, cycles);
            }
        }
    }

    ovs_mutex_unlock(&dp_netdev_mutex);

    unixctl_command_reply(conn, ds_cstr(&reply));
    ds_destroy(&reply);
}

static int
dpif_netdev_init(void)
{
    static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
                              clear_aux = PMD_INFO_CLEAR_STATS,
                              poll_aux = PMD_INFO_SHOW_RXQ;

    unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
                             0, 1, dpif_netdev_pmd_info,
                             (void *)&show_aux);
    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
                             0, 1, dpif_netdev_pmd_info,
                             (void *)&clear_aux);
    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
                             0, 1, dpif_netdev_pmd_info,
                             (void *)&poll_aux);
    return 0;
}

static int
dpif_netdev_enumerate(struct sset *all_dps,
                      const struct dpif_class *dpif_class)
{
    struct shash_node *node;

    ovs_mutex_lock(&dp_netdev_mutex);
    SHASH_FOR_EACH(node, &dp_netdevs) {
        struct dp_netdev *dp = node->data;
        if (dpif_class != dp->class) {
            /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
             * If the class doesn't match, skip this dpif. */
             continue;
        }
        sset_add(all_dps, node->name);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return 0;
}

static bool
dpif_netdev_class_is_dummy(const struct dpif_class *class)
{
    return class != &dpif_netdev_class;
}

static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    return strcmp(type, "internal") ? type
                  : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
                  : "tap";
}

static struct dpif *
create_dpif_netdev(struct dp_netdev *dp)
{
    uint16_t netflow_id = hash_string(dp->name, 0);
    struct dpif_netdev *dpif;

    ovs_refcount_ref(&dp->ref_cnt);

    dpif = xmalloc(sizeof *dpif);
    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
    dpif->dp = dp;
    dpif->last_port_seq = seq_read(dp->port_seq);

    return &dpif->dpif;
}

/* Choose an unused, non-zero port number and return it on success.
 * Return ODPP_NONE on failure. */
static odp_port_t
choose_port(struct dp_netdev *dp, const char *name)
    OVS_REQUIRES(dp->port_mutex)
{
    uint32_t port_no;

    if (dp->class != &dpif_netdev_class) {
        const char *p;
        int start_no = 0;

        /* If the port name begins with "br", start the number search at
         * 100 to make writing tests easier. */
        if (!strncmp(name, "br", 2)) {
            start_no = 100;
        }

        /* If the port name contains a number, try to assign that port number.
         * This can make writing unit tests easier because port numbers are
         * predictable. */
        for (p = name; *p != '\0'; p++) {
            if (isdigit((unsigned char) *p)) {
                port_no = start_no + strtol(p, NULL, 10);
                if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
                    && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
                    return u32_to_odp(port_no);
                }
                break;
            }
        }
    }

    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
        if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
            return u32_to_odp(port_no);
        }
    }

    return ODPP_NONE;
}

static int
create_dp_netdev(const char *name, const struct dpif_class *class,
                 struct dp_netdev **dpp)
    OVS_REQUIRES(dp_netdev_mutex)
{
    struct dp_netdev *dp;
    int error;

    dp = xzalloc(sizeof *dp);
    shash_add(&dp_netdevs, name, dp);

    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
    ovs_refcount_init(&dp->ref_cnt);
    atomic_flag_clear(&dp->destroyed);

    ovs_mutex_init(&dp->port_mutex);
    hmap_init(&dp->ports);
    dp->port_seq = seq_create();
    fat_rwlock_init(&dp->upcall_rwlock);

    dp->reconfigure_seq = seq_create();
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);

    /* Disable upcalls by default. */
    dp_netdev_disable_upcall(dp);
    dp->upcall_aux = NULL;
    dp->upcall_cb = NULL;

    conntrack_init(&dp->conntrack);

    cmap_init(&dp->poll_threads);
    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
    ovsthread_key_create(&dp->per_pmd_key, NULL);

    ovs_mutex_lock(&dp->port_mutex);
    dp_netdev_set_nonpmd(dp);

    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
                                                             "internal"),
                        ODPP_LOCAL);
    ovs_mutex_unlock(&dp->port_mutex);
    if (error) {
        dp_netdev_free(dp);
        return error;
    }

    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
    *dpp = dp;
    return 0;
}

static void
dp_netdev_request_reconfigure(struct dp_netdev *dp)
{
    seq_change(dp->reconfigure_seq);
}

static bool
dp_netdev_is_reconf_required(struct dp_netdev *dp)
{
    return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
}

static int
dpif_netdev_open(const struct dpif_class *class, const char *name,
                 bool create, struct dpif **dpifp)
{
    struct dp_netdev *dp;
    int error;

    ovs_mutex_lock(&dp_netdev_mutex);
    dp = shash_find_data(&dp_netdevs, name);
    if (!dp) {
        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
    } else {
        error = (dp->class != class ? EINVAL
                 : create ? EEXIST
                 : 0);
    }
    if (!error) {
        *dpifp = create_dpif_netdev(dp);
        dp->dpif = *dpifp;
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}

static void
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    /* Check that upcalls are disabled, i.e. that the rwlock is taken */
    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));

    /* Before freeing a lock we should release it */
    fat_rwlock_unlock(&dp->upcall_rwlock);
    fat_rwlock_destroy(&dp->upcall_rwlock);
}

/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
 * through the 'dp_netdevs' shash while freeing 'dp'. */
static void
dp_netdev_free(struct dp_netdev *dp)
    OVS_REQUIRES(dp_netdev_mutex)
{
    struct dp_netdev_port *port, *next;

    shash_find_and_delete(&dp_netdevs, dp->name);

    dp_netdev_destroy_all_pmds(dp);
    ovs_mutex_destroy(&dp->non_pmd_mutex);
    ovsthread_key_delete(dp->per_pmd_key);

    conntrack_destroy(&dp->conntrack);

    ovs_mutex_lock(&dp->port_mutex);
    HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
        do_del_port(dp, port);
    }
    ovs_mutex_unlock(&dp->port_mutex);
    cmap_destroy(&dp->poll_threads);

    seq_destroy(dp->reconfigure_seq);

    seq_destroy(dp->port_seq);
    hmap_destroy(&dp->ports);
    ovs_mutex_destroy(&dp->port_mutex);

    /* Upcalls must be disabled at this point */
    dp_netdev_destroy_upcall_lock(dp);

    free(dp->pmd_cmask);
    free(CONST_CAST(char *, dp->name));
    free(dp);
}

static void
dp_netdev_unref(struct dp_netdev *dp)
{
    if (dp) {
        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
        ovs_mutex_lock(&dp_netdev_mutex);
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
            dp_netdev_free(dp);
        }
        ovs_mutex_unlock(&dp_netdev_mutex);
    }
}

static void
dpif_netdev_close(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    dp_netdev_unref(dp);
    free(dpif);
}

static int
dpif_netdev_destroy(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    if (!atomic_flag_test_and_set(&dp->destroyed)) {
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
            OVS_NOT_REACHED();
        }
    }

    return 0;
}

/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
 * load/store semantics.  While the increment is not atomic, the load and
 * store operations are, making it impossible to read inconsistent values.
 *
 * This is used to update thread local stats counters. */
static void
non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
{
    unsigned long long tmp;

    atomic_read_relaxed(var, &tmp);
    tmp += n;
    atomic_store_relaxed(var, tmp);
}

static int
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *pmd;

    stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        unsigned long long n;
        stats->n_flows += cmap_count(&pmd->flow_table);

        atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
        stats->n_hit += n;
        atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
        stats->n_hit += n;
        atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
        stats->n_missed += n;
        atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
        stats->n_lost += n;
    }
    stats->n_masks = UINT32_MAX;
    stats->n_mask_hit = UINT64_MAX;

    return 0;
}

static void
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
{
    int old_seq;

    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
        ovs_mutex_lock(&pmd->port_mutex);
        pmd_load_cached_ports(pmd);
        ovs_mutex_unlock(&pmd->port_mutex);
        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
        return;
    }

    ovs_mutex_lock(&pmd->cond_mutex);
    atomic_add_relaxed(&pmd->change_seq, 1, &old_seq);
    ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
    ovs_mutex_unlock(&pmd->cond_mutex);
}

static uint32_t
hash_port_no(odp_port_t port_no)
{
    return hash_int(odp_to_u32(port_no), 0);
}

static int
port_create(const char *devname, const char *type,
            odp_port_t port_no, struct dp_netdev_port **portp)
{
    struct netdev_saved_flags *sf;
    struct dp_netdev_port *port;
    enum netdev_flags flags;
    struct netdev *netdev;
    int n_open_rxqs = 0;
    int n_cores = 0;
    int i, error;
    bool dynamic_txqs = false;

    *portp = NULL;

    /* Open and validate network device. */
    error = netdev_open(devname, type, &netdev);
    if (error) {
        return error;
    }
    /* XXX reject non-Ethernet devices */

    netdev_get_flags(netdev, &flags);
    if (flags & NETDEV_LOOPBACK) {
        VLOG_ERR("%s: cannot add a loopback device", devname);
        error = EINVAL;
        goto out;
    }

    if (netdev_is_pmd(netdev)) {
        n_cores = ovs_numa_get_n_cores();

        if (n_cores == OVS_CORE_UNSPEC) {
            VLOG_ERR("%s, cannot get cpu core info", devname);
            error = ENOENT;
            goto out;
        }
        /* There can only be ovs_numa_get_n_cores() pmd threads,
         * so creates a txq for each, and one extra for the non
         * pmd threads. */
        error = netdev_set_tx_multiq(netdev, n_cores + 1);
        if (error && (error != EOPNOTSUPP)) {
            VLOG_ERR("%s, cannot set multiq", devname);
            goto out;
        }
    }

    if (netdev_is_reconf_required(netdev)) {
        error = netdev_reconfigure(netdev);
        if (error) {
            goto out;
        }
    }

    if (netdev_is_pmd(netdev)) {
        if (netdev_n_txq(netdev) < n_cores + 1) {
            dynamic_txqs = true;
        }
    }

    port = xzalloc(sizeof *port);
    port->port_no = port_no;
    port->netdev = netdev;
    port->n_rxq = netdev_n_rxq(netdev);
    port->rxqs = xcalloc(port->n_rxq, sizeof *port->rxqs);
    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
    port->type = xstrdup(type);
    ovs_mutex_init(&port->txq_used_mutex);
    port->dynamic_txqs = dynamic_txqs;

    for (i = 0; i < port->n_rxq; i++) {
        error = netdev_rxq_open(netdev, &port->rxqs[i].rxq, i);
        if (error) {
            VLOG_ERR("%s: cannot receive packets on this network device (%s)",
                     devname, ovs_strerror(errno));
            goto out_rxq_close;
        }
        port->rxqs[i].core_id = OVS_CORE_UNSPEC;
        n_open_rxqs++;
    }

    error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
    if (error) {
        goto out_rxq_close;
    }
    port->sf = sf;

    *portp = port;

    return 0;

out_rxq_close:
    for (i = 0; i < n_open_rxqs; i++) {
        netdev_rxq_close(port->rxqs[i].rxq);
    }
    ovs_mutex_destroy(&port->txq_used_mutex);
    free(port->type);
    free(port->txq_used);
    free(port->rxqs);
    free(port);

out:
    netdev_close(netdev);
    return error;
}

static int
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
            odp_port_t port_no)
    OVS_REQUIRES(dp->port_mutex)
{
    struct dp_netdev_port *port;
    int error;

    /* Reject devices already in 'dp'. */
    if (!get_port_by_name(dp, devname, &port)) {
        return EEXIST;
    }

    error = port_create(devname, type, port_no, &port);
    if (error) {
        return error;
    }

    if (netdev_is_pmd(port->netdev)) {
        int numa_id = netdev_get_numa_id(port->netdev);

        ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
        dp_netdev_set_pmds_on_numa(dp, numa_id);
    }

    dp_netdev_add_port_to_pmds(dp, port);

    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
    seq_change(dp->port_seq);

    return 0;
}

static int
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
                     odp_port_t *port_nop)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
    const char *dpif_port;
    odp_port_t port_no;
    int error;

    ovs_mutex_lock(&dp->port_mutex);
    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
    if (*port_nop != ODPP_NONE) {
        port_no = *port_nop;
        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
    } else {
        port_no = choose_port(dp, dpif_port);
        error = port_no == ODPP_NONE ? EFBIG : 0;
    }
    if (!error) {
        *port_nop = port_no;
        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
    }
    ovs_mutex_unlock(&dp->port_mutex);

    return error;
}

static int
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    int error;

    ovs_mutex_lock(&dp->port_mutex);
    if (port_no == ODPP_LOCAL) {
        error = EINVAL;
    } else {
        struct dp_netdev_port *port;

        error = get_port_by_number(dp, port_no, &port);
        if (!error) {
            do_del_port(dp, port);
        }
    }
    ovs_mutex_unlock(&dp->port_mutex);

    return error;
}

static bool
is_valid_port_number(odp_port_t port_no)
{
    return port_no != ODPP_NONE;
}

static struct dp_netdev_port *
dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
    OVS_REQUIRES(dp->port_mutex)
{
    struct dp_netdev_port *port;

    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
        if (port->port_no == port_no) {
            return port;
        }
    }
    return NULL;
}

static int
get_port_by_number(struct dp_netdev *dp,
                   odp_port_t port_no, struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex)
{
    if (!is_valid_port_number(port_no)) {
        *portp = NULL;
        return EINVAL;
    } else {
        *portp = dp_netdev_lookup_port(dp, port_no);
        return *portp ? 0 : ENODEV;
    }
}

static void
port_destroy(struct dp_netdev_port *port)
{
    if (!port) {
        return;
    }

    netdev_close(port->netdev);
    netdev_restore_flags(port->sf);

    for (unsigned i = 0; i < port->n_rxq; i++) {
        netdev_rxq_close(port->rxqs[i].rxq);
    }
    ovs_mutex_destroy(&port->txq_used_mutex);
    free(port->rxq_affinity_list);
    free(port->txq_used);
    free(port->rxqs);
    free(port->type);
    free(port);
}

static int
get_port_by_name(struct dp_netdev *dp,
                 const char *devname, struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex)
{
    struct dp_netdev_port *port;

    HMAP_FOR_EACH (port, node, &dp->ports) {
        if (!strcmp(netdev_get_name(port->netdev), devname)) {
            *portp = port;
            return 0;
        }
    }

    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
     * existing port. */
    return ENODEV;
}

static int
get_n_pmd_threads(struct dp_netdev *dp)
{
    /* There is one non pmd thread in dp->poll_threads */
    return cmap_count(&dp->poll_threads) - 1;
}

static int
get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
{
    struct dp_netdev_pmd_thread *pmd;
    int n_pmds = 0;

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        if (pmd->numa_id == numa_id) {
            n_pmds++;
        }
    }

    return n_pmds;
}

/* Returns 'true' if there is a port with pmd netdev and the netdev is on
 * numa node 'numa_id' or its rx queue assigned to core on that numa node . */
static bool
has_pmd_rxq_for_numa(struct dp_netdev *dp, int numa_id)
    OVS_REQUIRES(dp->port_mutex)
{
    struct dp_netdev_port *port;

    HMAP_FOR_EACH (port, node, &dp->ports) {
        if (netdev_is_pmd(port->netdev)) {
            int i;

            if (netdev_get_numa_id(port->netdev) == numa_id) {
                return true;
            }

            for (i = 0; i < port->n_rxq; i++) {
                unsigned core_id = port->rxqs[i].core_id;

                if (core_id != OVS_CORE_UNSPEC
                    && ovs_numa_get_numa_id(core_id) == numa_id) {
                    return true;
                }
            }
        }
    }

    return false;
}


static void
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
    OVS_REQUIRES(dp->port_mutex)
{
    hmap_remove(&dp->ports, &port->node);
    seq_change(dp->port_seq);

    dp_netdev_del_port_from_all_pmds(dp, port);

    if (netdev_is_pmd(port->netdev)) {
        int numa_id = netdev_get_numa_id(port->netdev);

        /* PMD threads can not be on invalid numa node. */
        ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
        /* If there is no netdev on the numa node, deletes the pmd threads
         * for that numa. */
        if (!has_pmd_rxq_for_numa(dp, numa_id)) {
            dp_netdev_del_pmds_on_numa(dp, numa_id);
        }
    }

    port_destroy(port);
}

static void
answer_port_query(const struct dp_netdev_port *port,
                  struct dpif_port *dpif_port)
{
    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
    dpif_port->type = xstrdup(port->type);
    dpif_port->port_no = port->port_no;
}

static int
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
                                 struct dpif_port *dpif_port)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error;

    ovs_mutex_lock(&dp->port_mutex);
    error = get_port_by_number(dp, port_no, &port);
    if (!error && dpif_port) {
        answer_port_query(port, dpif_port);
    }
    ovs_mutex_unlock(&dp->port_mutex);

    return error;
}

static int
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
                               struct dpif_port *dpif_port)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error;

    ovs_mutex_lock(&dp->port_mutex);
    error = get_port_by_name(dp, devname, &port);
    if (!error && dpif_port) {
        answer_port_query(port, dpif_port);
    }
    ovs_mutex_unlock(&dp->port_mutex);

    return error;
}

static void
dp_netdev_flow_free(struct dp_netdev_flow *flow)
{
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
    free(flow);
}

static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
{
    if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
        ovsrcu_postpone(dp_netdev_flow_free, flow);
    }
}

static uint32_t
dp_netdev_flow_hash(const ovs_u128 *ufid)
{
    return ufid->u32[0];
}

static inline struct dpcls *
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
                           odp_port_t in_port)
{
    struct dpcls *cls;
    uint32_t hash = hash_port_no(in_port);
    CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
        if (cls->in_port == in_port) {
            /* Port classifier exists already */
            return cls;
        }
    }
    return NULL;
}

static inline struct dpcls *
dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
                         odp_port_t in_port)
    OVS_REQUIRES(pmd->flow_mutex)
{
    struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
    uint32_t hash = hash_port_no(in_port);

    if (!cls) {
        /* Create new classifier for in_port */
        cls = xmalloc(sizeof(*cls));
        dpcls_init(cls);
        cls->in_port = in_port;
        cmap_insert(&pmd->classifiers, &cls->node, hash);
        VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
    }
    return cls;
}

static void
dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
                          struct dp_netdev_flow *flow)
    OVS_REQUIRES(pmd->flow_mutex)
{
    struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
    struct dpcls *cls;
    odp_port_t in_port = flow->flow.in_port.odp_port;

    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
    ovs_assert(cls != NULL);
    dpcls_remove(cls, &flow->cr);
    cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
    flow->dead = true;

    dp_netdev_flow_unref(flow);
}

static void
dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
{
    struct dp_netdev_flow *netdev_flow;

    ovs_mutex_lock(&pmd->flow_mutex);
    CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
    }
    ovs_mutex_unlock(&pmd->flow_mutex);
}

static int
dpif_netdev_flow_flush(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *pmd;

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        dp_netdev_pmd_flow_flush(pmd);
    }

    return 0;
}

struct dp_netdev_port_state {
    struct hmap_position position;
    char *name;
};

static int
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
{
    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
    return 0;
}

static int
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
                           struct dpif_port *dpif_port)
{
    struct dp_netdev_port_state *state = state_;
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct hmap_node *node;
    int retval;

    ovs_mutex_lock(&dp->port_mutex);
    node = hmap_at_position(&dp->ports, &state->position);
    if (node) {
        struct dp_netdev_port *port;

        port = CONTAINER_OF(node, struct dp_netdev_port, node);

        free(state->name);
        state->name = xstrdup(netdev_get_name(port->netdev));
        dpif_port->name = state->name;
        dpif_port->type = port->type;
        dpif_port->port_no = port->port_no;

        retval = 0;
    } else {
        retval = EOF;
    }
    ovs_mutex_unlock(&dp->port_mutex);

    return retval;
}

static int
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
{
    struct dp_netdev_port_state *state = state_;
    free(state->name);
    free(state);
    return 0;
}

static int
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
{
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
    uint64_t new_port_seq;
    int error;

    new_port_seq = seq_read(dpif->dp->port_seq);
    if (dpif->last_port_seq != new_port_seq) {
        dpif->last_port_seq = new_port_seq;
        error = ENOBUFS;
    } else {
        error = EAGAIN;
    }

    return error;
}

static void
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
{
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);

    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
}

static struct dp_netdev_flow *
dp_netdev_flow_cast(const struct dpcls_rule *cr)
{
    return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
}

static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
{
    return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
}

/* netdev_flow_key utilities.
 *
 * netdev_flow_key is basically a miniflow.  We use these functions
 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
 *
 * - Since we are dealing exclusively with miniflows created by
 *   miniflow_extract(), if the map is different the miniflow is different.
 *   Therefore we can be faster by comparing the map and the miniflow in a
 *   single memcmp().
 * - These functions can be inlined by the compiler. */

/* Given the number of bits set in miniflow's maps, returns the size of the
 * 'netdev_flow_key.mf' */
static inline size_t
netdev_flow_key_size(size_t flow_u64s)
{
    return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
}

static inline bool
netdev_flow_key_equal(const struct netdev_flow_key *a,
                      const struct netdev_flow_key *b)
{
    /* 'b->len' may be not set yet. */
    return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
}

/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
 * generated by miniflow_extract. */
static inline bool
netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
                         const struct miniflow *mf)
{
    return !memcmp(&key->mf, mf, key->len);
}

static inline void
netdev_flow_key_clone(struct netdev_flow_key *dst,
                      const struct netdev_flow_key *src)
{
    memcpy(dst, src,
           offsetof(struct netdev_flow_key, mf) + src->len);
}

/* Slow. */
static void
netdev_flow_key_from_flow(struct netdev_flow_key *dst,
                          const struct flow *src)
{
    struct dp_packet packet;
    uint64_t buf_stub[512 / 8];

    dp_packet_use_stub(&packet, buf_stub, sizeof buf_stub);
    pkt_metadata_from_flow(&packet.md, src);
    flow_compose(&packet, src);
    miniflow_extract(&packet, &dst->mf);
    dp_packet_uninit(&packet);

    dst->len = netdev_flow_key_size(miniflow_n_values(&dst->mf));
    dst->hash = 0; /* Not computed yet. */
}

/* Initialize a netdev_flow_key 'mask' from 'match'. */
static inline void
netdev_flow_mask_init(struct netdev_flow_key *mask,
                      const struct match *match)
{
    uint64_t *dst = miniflow_values(&mask->mf);
    struct flowmap fmap;
    uint32_t hash = 0;
    size_t idx;

    /* Only check masks that make sense for the flow. */
    flow_wc_map(&match->flow, &fmap);
    flowmap_init(&mask->mf.map);

    FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
        uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);

        if (mask_u64) {
            flowmap_set(&mask->mf.map, idx, 1);
            *dst++ = mask_u64;
            hash = hash_add64(hash, mask_u64);
        }
    }

    map_t map;

    FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
        hash = hash_add64(hash, map);
    }

    size_t n = dst - miniflow_get_values(&mask->mf);

    mask->hash = hash_finish(hash, n * 8);
    mask->len = netdev_flow_key_size(n);
}

/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
static inline void
netdev_flow_key_init_masked(struct netdev_flow_key *dst,
                            const struct flow *flow,
                            const struct netdev_flow_key *mask)
{
    uint64_t *dst_u64 = miniflow_values(&dst->mf);
    const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
    uint32_t hash = 0;
    uint64_t value;

    dst->len = mask->len;
    dst->mf = mask->mf;   /* Copy maps. */

    FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
        *dst_u64 = value & *mask_u64++;
        hash = hash_add64(hash, *dst_u64++);
    }
    dst->hash = hash_finish(hash,
                            (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
}

/* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP)   \
    MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)

/* Returns a hash value for the bits of 'key' where there are 1-bits in
 * 'mask'. */
static inline uint32_t
netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
                             const struct netdev_flow_key *mask)
{
    const uint64_t *p = miniflow_get_values(&mask->mf);
    uint32_t hash = 0;
    uint64_t value;

    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
        hash = hash_add64(hash, value & *p++);
    }

    return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
}

static inline bool
emc_entry_alive(struct emc_entry *ce)
{
    return ce->flow && !ce->flow->dead;
}

static void
emc_clear_entry(struct emc_entry *ce)
{
    if (ce->flow) {
        dp_netdev_flow_unref(ce->flow);
        ce->flow = NULL;
    }
}

static inline void
emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
                 const struct netdev_flow_key *key)
{
    if (ce->flow != flow) {
        if (ce->flow) {
            dp_netdev_flow_unref(ce->flow);
        }

        if (dp_netdev_flow_ref(flow)) {
            ce->flow = flow;
        } else {
            ce->flow = NULL;
        }
    }
    if (key) {
        netdev_flow_key_clone(&ce->key, key);
    }
}

static inline void
emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
           struct dp_netdev_flow *flow)
{
    struct emc_entry *to_be_replaced = NULL;
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
        if (netdev_flow_key_equal(&current_entry->key, key)) {
            /* We found the entry with the 'mf' miniflow */
            emc_change_entry(current_entry, flow, NULL);
            return;
        }

        /* Replacement policy: put the flow in an empty (not alive) entry, or
         * in the first entry where it can be */
        if (!to_be_replaced
            || (emc_entry_alive(to_be_replaced)
                && !emc_entry_alive(current_entry))
            || current_entry->key.hash < to_be_replaced->key.hash) {
            to_be_replaced = current_entry;
        }
    }
    /* We didn't find the miniflow in the cache.
     * The 'to_be_replaced' entry is where the new flow will be stored */

    emc_change_entry(to_be_replaced, flow, key);
}

static inline struct dp_netdev_flow *
emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
{
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
        if (current_entry->key.hash == key->hash
            && emc_entry_alive(current_entry)
            && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {

            /* We found the entry with the 'key->mf' miniflow */
            return current_entry->flow;
        }
    }

    return NULL;
}

static struct dp_netdev_flow *
dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
                          const struct netdev_flow_key *key,
                          int *lookup_num_p)
{
    struct dpcls *cls;
    struct dpcls_rule *rule;
    odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, in_port));
    struct dp_netdev_flow *netdev_flow = NULL;

    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
    if (OVS_LIKELY(cls)) {
        dpcls_lookup(cls, key, &rule, 1, lookup_num_p);
        netdev_flow = dp_netdev_flow_cast(rule);
    }
    return netdev_flow;
}

static struct dp_netdev_flow *
dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
                        const ovs_u128 *ufidp, const struct nlattr *key,
                        size_t key_len)
{
    struct dp_netdev_flow *netdev_flow;
    struct flow flow;
    ovs_u128 ufid;

    /* If a UFID is not provided, determine one based on the key. */
    if (!ufidp && key && key_len
        && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow)) {
        dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
        ufidp = &ufid;
    }

    if (ufidp) {
        CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
                                 &pmd->flow_table) {
            if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
                return netdev_flow;
            }
        }
    }

    return NULL;
}

static void
get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
                    struct dpif_flow_stats *stats)
{
    struct dp_netdev_flow *netdev_flow;
    unsigned long long n;
    long long used;
    uint16_t flags;

    netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);

    atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
    stats->n_packets = n;
    atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
    stats->n_bytes = n;
    atomic_read_relaxed(&netdev_flow->stats.used, &used);
    stats->used = used;
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
    stats->tcp_flags = flags;
}

/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
 * protect them. */
static void
dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
                            struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
                            struct dpif_flow *flow, bool terse)
{
    if (terse) {
        memset(flow, 0, sizeof *flow);
    } else {
        struct flow_wildcards wc;
        struct dp_netdev_actions *actions;
        size_t offset;
        struct odp_flow_key_parms odp_parms = {
            .flow = &netdev_flow->flow,
            .mask = &wc.masks,
            .support = dp_netdev_support,
        };

        miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
        /* in_port is exact matched, but we have left it out from the mask for
         * optimnization reasons. Add in_port back to the mask. */
        wc.masks.in_port.odp_port = ODPP_NONE;

        /* Key */
        offset = key_buf->size;
        flow->key = ofpbuf_tail(key_buf);
        odp_flow_key_from_flow(&odp_parms, key_buf);
        flow->key_len = key_buf->size - offset;

        /* Mask */
        offset = mask_buf->size;
        flow->mask = ofpbuf_tail(mask_buf);
        odp_parms.key_buf = key_buf;
        odp_flow_key_from_mask(&odp_parms, mask_buf);
        flow->mask_len = mask_buf->size - offset;

        /* Actions */
        actions = dp_netdev_flow_get_actions(netdev_flow);
        flow->actions = actions->actions;
        flow->actions_len = actions->size;
    }

    flow->ufid = netdev_flow->ufid;
    flow->ufid_present = true;
    flow->pmd_id = netdev_flow->pmd_id;
    get_dpif_flow_stats(netdev_flow, &flow->stats);
}

static int
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
                              const struct nlattr *mask_key,
                              uint32_t mask_key_len, const struct flow *flow,
                              struct flow_wildcards *wc)
{
    enum odp_key_fitness fitness;

    fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
    if (fitness) {
        /* This should not happen: it indicates that
         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
         * disagree on the acceptable form of a mask.  Log the problem
         * as an error, with enough details to enable debugging. */
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

        if (!VLOG_DROP_ERR(&rl)) {
            struct ds s;

            ds_init(&s);
            odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
                            true);
            VLOG_ERR("internal error parsing flow mask %s (%s)",
                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
            ds_destroy(&s);
        }

        return EINVAL;
    }

    return 0;
}

static int
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
                              struct flow *flow)
{
    odp_port_t in_port;

    if (odp_flow_key_to_flow(key, key_len, flow)) {
        /* This should not happen: it indicates that odp_flow_key_from_flow()
         * and odp_flow_key_to_flow() disagree on the acceptable form of a
         * flow.  Log the problem as an error, with enough details to enable
         * debugging. */
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

        if (!VLOG_DROP_ERR(&rl)) {
            struct ds s;

            ds_init(&s);
            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
            ds_destroy(&s);
        }

        return EINVAL;
    }

    in_port = flow->in_port.odp_port;
    if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
        return EINVAL;
    }

    if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
        return EINVAL;
    }

    return 0;
}

static int
dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct dp_netdev_pmd_thread *pmd;
    struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
    struct hmapx_node *node;
    int error = EINVAL;

    if (get->pmd_id == PMD_ID_NULL) {
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
            if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
                dp_netdev_pmd_unref(pmd);
            }
        }
    } else {
        pmd = dp_netdev_get_pmd(dp, get->pmd_id);
        if (!pmd) {
            goto out;
        }
        hmapx_add(&to_find, pmd);
    }

    if (!hmapx_count(&to_find)) {
        goto out;
    }

    HMAPX_FOR_EACH (node, &to_find) {
        pmd = (struct dp_netdev_pmd_thread *) node->data;
        netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
                                              get->key_len);
        if (netdev_flow) {
            dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
                                        get->flow, false);
            error = 0;
            break;
        } else {
            error = ENOENT;
        }
    }

    HMAPX_FOR_EACH (node, &to_find) {
        pmd = (struct dp_netdev_pmd_thread *) node->data;
        dp_netdev_pmd_unref(pmd);
    }
out:
    hmapx_destroy(&to_find);
    return error;
}

static struct dp_netdev_flow *
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
                   struct match *match, const ovs_u128 *ufid,
                   const struct nlattr *actions, size_t actions_len)
    OVS_REQUIRES(pmd->flow_mutex)
{
    struct dp_netdev_flow *flow;
    struct netdev_flow_key mask;
    struct dpcls *cls;

    /* Make sure in_port is exact matched before we read it. */
    ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
    odp_port_t in_port = match->flow.in_port.odp_port;

    /* As we select the dpcls based on the port number, each netdev flow
     * belonging to the same dpcls will have the same odp_port value.
     * For performance reasons we wildcard odp_port here in the mask.  In the
     * typical case dp_hash is also wildcarded, and the resulting 8-byte
     * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
     * will not be part of the subtable mask.
     * This will speed up the hash computation during dpcls_lookup() because
     * there is one less call to hash_add64() in this case. */
    match->wc.masks.in_port.odp_port = 0;
    netdev_flow_mask_init(&mask, match);
    match->wc.masks.in_port.odp_port = ODPP_NONE;

    /* Make sure wc does not have metadata. */
    ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
               && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));

    /* Do not allocate extra space. */
    flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
    memset(&flow->stats, 0, sizeof flow->stats);
    flow->dead = false;
    flow->batch = NULL;
    *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
    *CONST_CAST(struct flow *, &flow->flow) = match->flow;
    *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
    ovs_refcount_init(&flow->ref_cnt);
    ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));

    netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);

    /* Select dpcls for in_port. Relies on in_port to be exact match. */
    cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
    dpcls_insert(cls, &flow->cr, &mask);

    cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
                dp_netdev_flow_hash(&flow->ufid));

    if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
        struct ds ds = DS_EMPTY_INITIALIZER;
        struct ofpbuf key_buf, mask_buf;
        struct odp_flow_key_parms odp_parms = {
            .flow = &match->flow,
            .mask = &match->wc.masks,
            .support = dp_netdev_support,
        };

        ofpbuf_init(&key_buf, 0);
        ofpbuf_init(&mask_buf, 0);

        odp_flow_key_from_flow(&odp_parms, &key_buf);
        odp_parms.key_buf = &key_buf;
        odp_flow_key_from_mask(&odp_parms, &mask_buf);

        ds_put_cstr(&ds, "flow_add: ");
        odp_format_ufid(ufid, &ds);
        ds_put_cstr(&ds, " ");
        odp_flow_format(key_buf.data, key_buf.size,
                        mask_buf.data, mask_buf.size,
                        NULL, &ds, false);
        ds_put_cstr(&ds, ", actions:");
        format_odp_actions(&ds, actions, actions_len);

        VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));

        ofpbuf_uninit(&key_buf);
        ofpbuf_uninit(&mask_buf);
        ds_destroy(&ds);
    }

    return flow;
}

static int
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct netdev_flow_key key;
    struct dp_netdev_pmd_thread *pmd;
    struct match match;
    ovs_u128 ufid;
    unsigned pmd_id = put->pmd_id == PMD_ID_NULL
                      ? NON_PMD_CORE_ID : put->pmd_id;
    int error;

    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
    if (error) {
        return error;
    }
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
                                          put->mask, put->mask_len,
                                          &match.flow, &match.wc);
    if (error) {
        return error;
    }

    pmd = dp_netdev_get_pmd(dp, pmd_id);
    if (!pmd) {
        return EINVAL;
    }

    /* Must produce a netdev_flow_key for lookup.
     * This interface is no longer performance critical, since it is not used
     * for upcall processing any more. */
    netdev_flow_key_from_flow(&key, &match.flow);

    if (put->ufid) {
        ufid = *put->ufid;
    } else {
        dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
    }

    ovs_mutex_lock(&pmd->flow_mutex);
    netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &key, NULL);
    if (!netdev_flow) {
        if (put->flags & DPIF_FP_CREATE) {
            if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
                if (put->stats) {
                    memset(put->stats, 0, sizeof *put->stats);
                }
                dp_netdev_flow_add(pmd, &match, &ufid, put->actions,
                                   put->actions_len);
                error = 0;
            } else {
                error = EFBIG;
            }
        } else {
            error = ENOENT;
        }
    } else {
        if (put->flags & DPIF_FP_MODIFY
            && flow_equal(&match.flow, &netdev_flow->flow)) {
            struct dp_netdev_actions *new_actions;
            struct dp_netdev_actions *old_actions;

            new_actions = dp_netdev_actions_create(put->actions,
                                                   put->actions_len);

            old_actions = dp_netdev_flow_get_actions(netdev_flow);
            ovsrcu_set(&netdev_flow->actions, new_actions);

            if (put->stats) {
                get_dpif_flow_stats(netdev_flow, put->stats);
            }
            if (put->flags & DPIF_FP_ZERO_STATS) {
                /* XXX: The userspace datapath uses thread local statistics
                 * (for flows), which should be updated only by the owning
                 * thread.  Since we cannot write on stats memory here,
                 * we choose not to support this flag.  Please note:
                 * - This feature is currently used only by dpctl commands with
                 *   option --clear.
                 * - Should the need arise, this operation can be implemented
                 *   by keeping a base value (to be update here) for each
                 *   counter, and subtracting it before outputting the stats */
                error = EOPNOTSUPP;
            }

            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
        } else if (put->flags & DPIF_FP_CREATE) {
            error = EEXIST;
        } else {
            /* Overlapping flow. */
            error = EINVAL;
        }
    }
    ovs_mutex_unlock(&pmd->flow_mutex);
    dp_netdev_pmd_unref(pmd);

    return error;
}

static int
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct dp_netdev_pmd_thread *pmd;
    unsigned pmd_id = del->pmd_id == PMD_ID_NULL
                      ? NON_PMD_CORE_ID : del->pmd_id;
    int error = 0;

    pmd = dp_netdev_get_pmd(dp, pmd_id);
    if (!pmd) {
        return EINVAL;
    }

    ovs_mutex_lock(&pmd->flow_mutex);
    netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
                                          del->key_len);
    if (netdev_flow) {
        if (del->stats) {
            get_dpif_flow_stats(netdev_flow, del->stats);
        }
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
    } else {
        error = ENOENT;
    }
    ovs_mutex_unlock(&pmd->flow_mutex);
    dp_netdev_pmd_unref(pmd);

    return error;
}

struct dpif_netdev_flow_dump {
    struct dpif_flow_dump up;
    struct cmap_position poll_thread_pos;
    struct cmap_position flow_pos;
    struct dp_netdev_pmd_thread *cur_pmd;
    int status;
    struct ovs_mutex mutex;
};

static struct dpif_netdev_flow_dump *
dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
{
    return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
}

static struct dpif_flow_dump *
dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse)
{
    struct dpif_netdev_flow_dump *dump;

    dump = xzalloc(sizeof *dump);
    dpif_flow_dump_init(&dump->up, dpif_);
    dump->up.terse = terse;
    ovs_mutex_init(&dump->mutex);

    return &dump->up;
}

static int
dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
{
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);

    ovs_mutex_destroy(&dump->mutex);
    free(dump);
    return 0;
}

struct dpif_netdev_flow_dump_thread {
    struct dpif_flow_dump_thread up;
    struct dpif_netdev_flow_dump *dump;
    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
};

static struct dpif_netdev_flow_dump_thread *
dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
{
    return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
}

static struct dpif_flow_dump_thread *
dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
{
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
    struct dpif_netdev_flow_dump_thread *thread;

    thread = xmalloc(sizeof *thread);
    dpif_flow_dump_thread_init(&thread->up, &dump->up);
    thread->dump = dump;
    return &thread->up;
}

static void
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
{
    struct dpif_netdev_flow_dump_thread *thread
        = dpif_netdev_flow_dump_thread_cast(thread_);

    free(thread);
}

static int
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
                           struct dpif_flow *flows, int max_flows)
{
    struct dpif_netdev_flow_dump_thread *thread
        = dpif_netdev_flow_dump_thread_cast(thread_);
    struct dpif_netdev_flow_dump *dump = thread->dump;
    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
    int n_flows = 0;
    int i;

    ovs_mutex_lock(&dump->mutex);
    if (!dump->status) {
        struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
        struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
        struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
        int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);

        /* First call to dump_next(), extracts the first pmd thread.
         * If there is no pmd thread, returns immediately. */
        if (!pmd) {
            pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
            if (!pmd) {
                ovs_mutex_unlock(&dump->mutex);
                return n_flows;

            }
        }

        do {
            for (n_flows = 0; n_flows < flow_limit; n_flows++) {
                struct cmap_node *node;

                node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
                if (!node) {
                    break;
                }
                netdev_flows[n_flows] = CONTAINER_OF(node,
                                                     struct dp_netdev_flow,
                                                     node);
            }
            /* When finishing dumping the current pmd thread, moves to
             * the next. */
            if (n_flows < flow_limit) {
                memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
                dp_netdev_pmd_unref(pmd);
                pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
                if (!pmd) {
                    dump->status = EOF;
                    break;
                }
            }
            /* Keeps the reference to next caller. */
            dump->cur_pmd = pmd;

            /* If the current dump is empty, do not exit the loop, since the
             * remaining pmds could have flows to be dumped.  Just dumps again
             * on the new 'pmd'. */
        } while (!n_flows);
    }
    ovs_mutex_unlock(&dump->mutex);

    for (i = 0; i < n_flows; i++) {
        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
        struct odputil_keybuf *keybuf = &thread->keybuf[i];
        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
        struct dpif_flow *f = &flows[i];
        struct ofpbuf key, mask;

        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
        dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
                                    dump->up.terse);
    }

    return n_flows;
}

static int
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *pmd;
    struct dp_packet_batch pp;

    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
        dp_packet_size(execute->packet) > UINT16_MAX) {
        return EINVAL;
    }

    /* Tries finding the 'pmd'.  If NULL is returned, that means
     * the current thread is a non-pmd thread and should use
     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
    pmd = ovsthread_getspecific(dp->per_pmd_key);
    if (!pmd) {
        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
        if (!pmd) {
            return EBUSY;
        }
    }

    /* If the current thread is non-pmd thread, acquires
     * the 'non_pmd_mutex'. */
    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_lock(&dp->non_pmd_mutex);
    }

    /* The action processing expects the RSS hash to be valid, because
     * it's always initialized at the beginning of datapath processing.
     * In this case, though, 'execute->packet' may not have gone through
     * the datapath at all, it may have been generated by the upper layer
     * (OpenFlow packet-out, BFD frame, ...). */
    if (!dp_packet_rss_valid(execute->packet)) {
        dp_packet_set_rss_hash(execute->packet,
                               flow_hash_5tuple(execute->flow, 0));
    }

    packet_batch_init_packet(&pp, execute->packet);
    dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
                              execute->actions, execute->actions_len,
                              time_msec());

    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_unlock(&dp->non_pmd_mutex);
        dp_netdev_pmd_unref(pmd);
    }

    return 0;
}

static void
dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
{
    size_t i;

    for (i = 0; i < n_ops; i++) {
        struct dpif_op *op = ops[i];

        switch (op->type) {
        case DPIF_OP_FLOW_PUT:
            op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
            break;

        case DPIF_OP_FLOW_DEL:
            op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
            break;

        case DPIF_OP_EXECUTE:
            op->error = dpif_netdev_execute(dpif, &op->u.execute);
            break;

        case DPIF_OP_FLOW_GET:
            op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
            break;
        }
    }
}

/* Changes the number or the affinity of pmd threads.  The changes are actually
 * applied in dpif_netdev_run(). */
static int
dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
        free(dp->pmd_cmask);
        dp->pmd_cmask = nullable_xstrdup(cmask);
        dp_netdev_request_reconfigure(dp);
    }

    return 0;
}

/* Parses affinity list and returns result in 'core_ids'. */
static int
parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
{
    unsigned i;
    char *list, *copy, *key, *value;
    int error = 0;

    for (i = 0; i < n_rxq; i++) {
        core_ids[i] = OVS_CORE_UNSPEC;
    }

    if (!affinity_list) {
        return 0;
    }

    list = copy = xstrdup(affinity_list);

    while (ofputil_parse_key_value(&list, &key, &value)) {
        int rxq_id, core_id;

        if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
            || !str_to_int(value, 0, &core_id) || core_id < 0) {
            error = EINVAL;
            break;
        }

        if (rxq_id < n_rxq) {
            core_ids[rxq_id] = core_id;
        }
    }

    free(copy);
    return error;
}

/* Parses 'affinity_list' and applies configuration if it is valid. */
static int
dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
                                  const char *affinity_list)
{
    unsigned *core_ids, i;
    int error = 0;

    core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
    if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
        error = EINVAL;
        goto exit;
    }

    for (i = 0; i < port->n_rxq; i++) {
        port->rxqs[i].core_id = core_ids[i];
    }

exit:
    free(core_ids);
    return error;
}

/* Changes the affinity of port's rx queues.  The changes are actually applied
 * in dpif_netdev_run(). */
static int
dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
                            const struct smap *cfg)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error = 0;
    const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");

    ovs_mutex_lock(&dp->port_mutex);
    error = get_port_by_number(dp, port_no, &port);
    if (error || !netdev_is_pmd(port->netdev)
        || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
        goto unlock;
    }

    error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
    if (error) {
        goto unlock;
    }
    free(port->rxq_affinity_list);
    port->rxq_affinity_list = nullable_xstrdup(affinity_list);

    dp_netdev_request_reconfigure(dp);
unlock:
    ovs_mutex_unlock(&dp->port_mutex);
    return error;
}

static int
dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
                              uint32_t queue_id, uint32_t *priority)
{
    *priority = queue_id;
    return 0;
}


/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
 * a copy of the 'ofpacts_len' bytes of 'ofpacts'. */
struct dp_netdev_actions *
dp_netdev_actions_create(const struct nlattr *actions, size_t size)
{
    struct dp_netdev_actions *netdev_actions;

    netdev_actions = xmalloc(sizeof *netdev_actions + size);
    memcpy(netdev_actions->actions, actions, size);
    netdev_actions->size = size;

    return netdev_actions;
}

struct dp_netdev_actions *
dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
{
    return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
}

static void
dp_netdev_actions_free(struct dp_netdev_actions *actions)
{
    free(actions);
}

static inline unsigned long long
cycles_counter(void)
{
#ifdef DPDK_NETDEV
    return rte_get_tsc_cycles();
#else
    return 0;
#endif
}

/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
extern struct ovs_mutex cycles_counter_fake_mutex;

/* Start counting cycles.  Must be followed by 'cycles_count_end()' */
static inline void
cycles_count_start(struct dp_netdev_pmd_thread *pmd)
    OVS_ACQUIRES(&cycles_counter_fake_mutex)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    pmd->last_cycles = cycles_counter();
}

/* Stop counting cycles and add them to the counter 'type' */
static inline void
cycles_count_end(struct dp_netdev_pmd_thread *pmd,
                 enum pmd_cycles_counter_type type)
    OVS_RELEASES(&cycles_counter_fake_mutex)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    unsigned long long interval = cycles_counter() - pmd->last_cycles;

    non_atomic_ullong_add(&pmd->cycles.n[type], interval);
}

static void
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
                           struct dp_netdev_port *port,
                           struct netdev_rxq *rxq)
{
    struct dp_packet_batch batch;
    int error;

    dp_packet_batch_init(&batch);
    cycles_count_start(pmd);
    error = netdev_rxq_recv(rxq, &batch);
    cycles_count_end(pmd, PMD_CYCLES_POLLING);
    if (!error) {
        *recirc_depth_get() = 0;

        cycles_count_start(pmd);
        dp_netdev_input(pmd, &batch, port->port_no);
        cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
    } else if (error != EAGAIN && error != EOPNOTSUPP) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

        VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
                    netdev_get_name(port->netdev), ovs_strerror(error));
    }
}

static int
port_reconfigure(struct dp_netdev_port *port)
{
    struct netdev *netdev = port->netdev;
    int i, err;

    if (!netdev_is_reconf_required(netdev)) {
        return 0;
    }

    /* Closes the existing 'rxq's. */
    for (i = 0; i < port->n_rxq; i++) {
        netdev_rxq_close(port->rxqs[i].rxq);
        port->rxqs[i].rxq = NULL;
    }
    port->n_rxq = 0;

    /* Allows 'netdev' to apply the pending configuration changes. */
    err = netdev_reconfigure(netdev);
    if (err && (err != EOPNOTSUPP)) {
        VLOG_ERR("Failed to set interface %s new configuration",
                 netdev_get_name(netdev));
        return err;
    }
    /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
    port->rxqs = xrealloc(port->rxqs,
                          sizeof *port->rxqs * netdev_n_rxq(netdev));
    /* Realloc 'used' counters for tx queues. */
    free(port->txq_used);
    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);

    for (i = 0; i < netdev_n_rxq(netdev); i++) {
        err = netdev_rxq_open(netdev, &port->rxqs[i].rxq, i);
        if (err) {
            return err;
        }
        port->n_rxq++;
    }

    /* Parse affinity list to apply configuration for new queues. */
    dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);

    return 0;
}

static void
reconfigure_pmd_threads(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex)
{
    struct dp_netdev_port *port, *next;
    int n_cores;

    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);

    dp_netdev_destroy_all_pmds(dp);

    /* Reconfigures the cpu mask. */
    ovs_numa_set_cpu_mask(dp->pmd_cmask);

    n_cores = ovs_numa_get_n_cores();
    if (n_cores == OVS_CORE_UNSPEC) {
        VLOG_ERR("Cannot get cpu core info");
        return;
    }

    HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
        int err;

        err = port_reconfigure(port);
        if (err) {
            hmap_remove(&dp->ports, &port->node);
            seq_change(dp->port_seq);
            port_destroy(port);
        } else {
            port->dynamic_txqs = netdev_n_txq(port->netdev) < n_cores + 1;
        }
    }
    /* Restores the non-pmd. */
    dp_netdev_set_nonpmd(dp);
    /* Restores all pmd threads. */
    dp_netdev_reset_pmd_threads(dp);
}

/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
static bool
ports_require_restart(const struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex)
{
    struct dp_netdev_port *port;

    HMAP_FOR_EACH (port, node, &dp->ports) {
        if (netdev_is_reconf_required(port->netdev)) {
            return true;
        }
    }

    return false;
}

/* Return true if needs to revalidate datapath flows. */
static bool
dpif_netdev_run(struct dpif *dpif)
{
    struct dp_netdev_port *port;
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *non_pmd;
    uint64_t new_tnl_seq;

    ovs_mutex_lock(&dp->port_mutex);
    non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
    if (non_pmd) {
        ovs_mutex_lock(&dp->non_pmd_mutex);
        HMAP_FOR_EACH (port, node, &dp->ports) {
            if (!netdev_is_pmd(port->netdev)) {
                int i;

                for (i = 0; i < port->n_rxq; i++) {
                    dp_netdev_process_rxq_port(non_pmd, port,
                                               port->rxqs[i].rxq);
                }
            }
        }
        dpif_netdev_xps_revalidate_pmd(non_pmd, time_msec(), false);
        ovs_mutex_unlock(&dp->non_pmd_mutex);

        dp_netdev_pmd_unref(non_pmd);
    }

    if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
        reconfigure_pmd_threads(dp);
    }
    ovs_mutex_unlock(&dp->port_mutex);

    tnl_neigh_cache_run();
    tnl_port_map_run();
    new_tnl_seq = seq_read(tnl_conf_seq);

    if (dp->last_tnl_conf_seq != new_tnl_seq) {
        dp->last_tnl_conf_seq = new_tnl_seq;
        return true;
    }
    return false;
}

static void
dpif_netdev_wait(struct dpif *dpif)
{
    struct dp_netdev_port *port;
    struct dp_netdev *dp = get_dp_netdev(dpif);

    ovs_mutex_lock(&dp_netdev_mutex);
    ovs_mutex_lock(&dp->port_mutex);
    HMAP_FOR_EACH (port, node, &dp->ports) {
        netdev_wait_reconf_required(port->netdev);
        if (!netdev_is_pmd(port->netdev)) {
            int i;

            for (i = 0; i < port->n_rxq; i++) {
                netdev_rxq_wait(port->rxqs[i].rxq);
            }
        }
    }
    ovs_mutex_unlock(&dp->port_mutex);
    ovs_mutex_unlock(&dp_netdev_mutex);
    seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
}

static void
pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
{
    struct tx_port *tx_port_cached;

    /* Free all used tx queue ids. */
    dpif_netdev_xps_revalidate_pmd(pmd, 0, true);

    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
        free(tx_port_cached);
    }
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
        free(tx_port_cached);
    }
}

/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
 * 'pmd->port_cache' (thread local) */
static void
pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
    OVS_REQUIRES(pmd->port_mutex)
{
    struct tx_port *tx_port, *tx_port_cached;

    pmd_free_cached_ports(pmd);
    hmap_shrink(&pmd->send_port_cache);
    hmap_shrink(&pmd->tnl_port_cache);

    HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
        if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
            hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
                        hash_port_no(tx_port_cached->port->port_no));
        }

        if (netdev_n_txq(tx_port->port->netdev)) {
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
            hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
                        hash_port_no(tx_port_cached->port->port_no));
        }
    }
}

static int
pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
                          struct rxq_poll **ppoll_list)
{
    struct rxq_poll *poll_list = *ppoll_list;
    struct rxq_poll *poll;
    int i;

    ovs_mutex_lock(&pmd->port_mutex);
    poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);

    i = 0;
    LIST_FOR_EACH (poll, node, &pmd->poll_list) {
        poll_list[i++] = *poll;
    }

    pmd_load_cached_ports(pmd);

    ovs_mutex_unlock(&pmd->port_mutex);

    *ppoll_list = poll_list;
    return i;
}

static void *
pmd_thread_main(void *f_)
{
    struct dp_netdev_pmd_thread *pmd = f_;
    unsigned int lc = 0;
    struct rxq_poll *poll_list;
    unsigned int port_seq = PMD_INITIAL_SEQ;
    bool exiting;
    int poll_cnt;
    int i;

    poll_list = NULL;

    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
    ovs_numa_thread_setaffinity_core(pmd->core_id);
    dpdk_set_lcore_id(pmd->core_id);
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
reload:
    emc_cache_init(&pmd->flow_cache);

    /* List port/core affinity */
    for (i = 0; i < poll_cnt; i++) {
       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
                pmd->core_id, netdev_get_name(poll_list[i].port->netdev),
                netdev_rxq_get_queue_id(poll_list[i].rx));
    }

    for (;;) {
        for (i = 0; i < poll_cnt; i++) {
            dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
        }

        if (lc++ > 1024) {
            unsigned int seq;

            lc = 0;

            coverage_try_clear();
            dp_netdev_pmd_try_optimize(pmd);
            if (!ovsrcu_try_quiesce()) {
                emc_cache_slow_sweep(&pmd->flow_cache);
            }

            atomic_read_relaxed(&pmd->change_seq, &seq);
            if (seq != port_seq) {
                port_seq = seq;
                break;
            }
        }
    }

    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
    exiting = latch_is_set(&pmd->exit_latch);
    /* Signal here to make sure the pmd finishes
     * reloading the updated configuration. */
    dp_netdev_pmd_reload_done(pmd);

    emc_cache_uninit(&pmd->flow_cache);

    if (!exiting) {
        goto reload;
    }

    free(poll_list);
    pmd_free_cached_ports(pmd);
    return NULL;
}

static void
dp_netdev_disable_upcall(struct dp_netdev *dp)
    OVS_ACQUIRES(dp->upcall_rwlock)
{
    fat_rwlock_wrlock(&dp->upcall_rwlock);
}

static void
dpif_netdev_disable_upcall(struct dpif *dpif)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    dp_netdev_disable_upcall(dp);
}

static void
dp_netdev_enable_upcall(struct dp_netdev *dp)
    OVS_RELEASES(dp->upcall_rwlock)
{
    fat_rwlock_unlock(&dp->upcall_rwlock);
}

static void
dpif_netdev_enable_upcall(struct dpif *dpif)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    dp_netdev_enable_upcall(dp);
}

static void
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
{
    ovs_mutex_lock(&pmd->cond_mutex);
    xpthread_cond_signal(&pmd->cond);
    ovs_mutex_unlock(&pmd->cond_mutex);
}

/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
 * the pointer if succeeds, otherwise, NULL (it can return NULL even if
 * 'core_id' is NON_PMD_CORE_ID).
 *
 * Caller must unrefs the returned reference.  */
static struct dp_netdev_pmd_thread *
dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
{
    struct dp_netdev_pmd_thread *pmd;
    const struct cmap_node *pnode;

    pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
    if (!pnode) {
        return NULL;
    }
    pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);

    return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
}

/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
static void
dp_netdev_set_nonpmd(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex)
{
    struct dp_netdev_pmd_thread *non_pmd;
    struct dp_netdev_port *port;

    non_pmd = xzalloc(sizeof *non_pmd);
    dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);

    HMAP_FOR_EACH (port, node, &dp->ports) {
        dp_netdev_add_port_tx_to_pmd(non_pmd, port);
    }

    dp_netdev_reload_pmd__(non_pmd);
}

/* Caller must have valid pointer to 'pmd'. */
static bool
dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
{
    return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
}

static void
dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
{
    if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
        ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
    }
}

/* Given cmap position 'pos', tries to ref the next node.  If try_ref()
 * fails, keeps checking for next node until reaching the end of cmap.
 *
 * Caller must unrefs the returned reference. */
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
{
    struct dp_netdev_pmd_thread *next;

    do {
        struct cmap_node *node;

        node = cmap_next_position(&dp->poll_threads, pos);
        next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
            : NULL;
    } while (next && !dp_netdev_pmd_try_ref(next));

    return next;
}

/* Configures the 'pmd' based on the input argument. */
static void
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
                        unsigned core_id, int numa_id)
{
    pmd->dp = dp;
    pmd->core_id = core_id;
    pmd->numa_id = numa_id;
    pmd->poll_cnt = 0;

    atomic_init(&pmd->static_tx_qid,
                (core_id == NON_PMD_CORE_ID)
                ? ovs_numa_get_n_cores()
                : get_n_pmd_threads(dp));

    ovs_refcount_init(&pmd->ref_cnt);
    latch_init(&pmd->exit_latch);
    atomic_init(&pmd->change_seq, PMD_INITIAL_SEQ);
    xpthread_cond_init(&pmd->cond, NULL);
    ovs_mutex_init(&pmd->cond_mutex);
    ovs_mutex_init(&pmd->flow_mutex);
    ovs_mutex_init(&pmd->port_mutex);
    cmap_init(&pmd->flow_table);
    cmap_init(&pmd->classifiers);
    pmd->next_optimization = time_msec() + DPCLS_OPTIMIZATION_INTERVAL;
    ovs_list_init(&pmd->poll_list);
    hmap_init(&pmd->tx_ports);
    hmap_init(&pmd->tnl_port_cache);
    hmap_init(&pmd->send_port_cache);
    /* init the 'flow_cache' since there is no
     * actual thread created for NON_PMD_CORE_ID. */
    if (core_id == NON_PMD_CORE_ID) {
        emc_cache_init(&pmd->flow_cache);
    }
    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
                hash_int(core_id, 0));
}

static void
dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
{
    struct dpcls *cls;

    dp_netdev_pmd_flow_flush(pmd);
    hmap_destroy(&pmd->send_port_cache);
    hmap_destroy(&pmd->tnl_port_cache);
    hmap_destroy(&pmd->tx_ports);
    /* All flows (including their dpcls_rules) have been deleted already */
    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
        dpcls_destroy(cls);
        ovsrcu_postpone(free, cls);
    }
    cmap_destroy(&pmd->classifiers);
    cmap_destroy(&pmd->flow_table);
    ovs_mutex_destroy(&pmd->flow_mutex);
    latch_destroy(&pmd->exit_latch);
    xpthread_cond_destroy(&pmd->cond);
    ovs_mutex_destroy(&pmd->cond_mutex);
    ovs_mutex_destroy(&pmd->port_mutex);
    free(pmd);
}

/* Stops the pmd thread, removes it from the 'dp->poll_threads',
 * and unrefs the struct. */
static void
dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
{
    /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
     * but extra cleanup is necessary */
    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_lock(&dp->non_pmd_mutex);
        emc_cache_uninit(&pmd->flow_cache);
        pmd_free_cached_ports(pmd);
        ovs_mutex_unlock(&dp->non_pmd_mutex);
    } else {
        latch_set(&pmd->exit_latch);
        dp_netdev_reload_pmd__(pmd);
        ovs_numa_unpin_core(pmd->core_id);
        xpthread_join(pmd->thread, NULL);
    }

    dp_netdev_pmd_clear_ports(pmd);

    /* Purges the 'pmd''s flows after stopping the thread, but before
     * destroying the flows, so that the flow stats can be collected. */
    if (dp->dp_purge_cb) {
        dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
    }
    cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
    dp_netdev_pmd_unref(pmd);
}

/* Destroys all pmd threads. */
static void
dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
{
    struct dp_netdev_pmd_thread *pmd;
    struct dp_netdev_pmd_thread **pmd_list;
    size_t k = 0, n_pmds;

    n_pmds = cmap_count(&dp->poll_threads);
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        /* We cannot call dp_netdev_del_pmd(), since it alters
         * 'dp->poll_threads' (while we're iterating it) and it
         * might quiesce. */
        ovs_assert(k < n_pmds);
        pmd_list[k++] = pmd;
    }

    for (size_t i = 0; i < k; i++) {
        dp_netdev_del_pmd(dp, pmd_list[i]);
    }
    free(pmd_list);
}

/* Deletes all pmd threads on numa node 'numa_id' and
 * fixes static_tx_qids of other threads to keep them sequential. */
static void
dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
{
    struct dp_netdev_pmd_thread *pmd;
    int n_pmds_on_numa, n_pmds;
    int *free_idx, k = 0;
    struct dp_netdev_pmd_thread **pmd_list;

    n_pmds_on_numa = get_n_pmd_threads_on_numa(dp, numa_id);
    free_idx = xcalloc(n_pmds_on_numa, sizeof *free_idx);
    pmd_list = xcalloc(n_pmds_on_numa, sizeof *pmd_list);

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        /* We cannot call dp_netdev_del_pmd(), since it alters
         * 'dp->poll_threads' (while we're iterating it) and it
         * might quiesce. */
        if (pmd->numa_id == numa_id && pmd->core_id != NON_PMD_CORE_ID) {
            atomic_read_relaxed(&pmd->static_tx_qid, &free_idx[k]);
            pmd_list[k] = pmd;
            ovs_assert(k < n_pmds_on_numa);
            k++;
        }
    }

    for (int i = 0; i < k; i++) {
        dp_netdev_del_pmd(dp, pmd_list[i]);
    }

    n_pmds = get_n_pmd_threads(dp);
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        int old_tx_qid;

        atomic_read_relaxed(&pmd->static_tx_qid, &old_tx_qid);

        if (old_tx_qid >= n_pmds && pmd->core_id != NON_PMD_CORE_ID) {
            int new_tx_qid = free_idx[--k];

            atomic_store_relaxed(&pmd->static_tx_qid, new_tx_qid);
        }
    }

    free(pmd_list);
    free(free_idx);
}

/* Deletes all rx queues from pmd->poll_list and all the ports from
 * pmd->tx_ports. */
static void
dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
{
    struct rxq_poll *poll;
    struct tx_port *port;

    ovs_mutex_lock(&pmd->port_mutex);
    LIST_FOR_EACH_POP (poll, node, &pmd->poll_list) {
        free(poll);
    }
    pmd->poll_cnt = 0;
    HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
        free(port);
    }
    ovs_mutex_unlock(&pmd->port_mutex);
}

static struct tx_port *
tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
{
    struct tx_port *tx;

    HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
        if (tx->port->port_no == port_no) {
            return tx;
        }
    }

    return NULL;
}

/* Deletes all rx queues of 'port' from 'poll_list', and the 'port' from
 * 'tx_ports' of 'pmd' thread.  Returns true if 'port' was found in 'pmd'
 * (therefore a restart is required). */
static bool
dp_netdev_del_port_from_pmd__(struct dp_netdev_port *port,
                              struct dp_netdev_pmd_thread *pmd)
{
    struct rxq_poll *poll, *next;
    struct tx_port *tx;
    bool found = false;

    ovs_mutex_lock(&pmd->port_mutex);
    LIST_FOR_EACH_SAFE (poll, next, node, &pmd->poll_list) {
        if (poll->port == port) {
            found = true;
            ovs_list_remove(&poll->node);
            pmd->poll_cnt--;
            free(poll);
        }
    }

    tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
    if (tx) {
        hmap_remove(&pmd->tx_ports, &tx->node);
        free(tx);
        found = true;
    }
    ovs_mutex_unlock(&pmd->port_mutex);

    return found;
}

/* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
 * threads.  The pmd threads that need to be restarted are inserted in
 * 'to_reload'. */
static void
dp_netdev_del_port_from_all_pmds__(struct dp_netdev *dp,
                                   struct dp_netdev_port *port,
                                   struct hmapx *to_reload)
{
    struct dp_netdev_pmd_thread *pmd;

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        bool found;

        found = dp_netdev_del_port_from_pmd__(port, pmd);

        if (found) {
            hmapx_add(to_reload, pmd);
        }
    }
}

/* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
 * threads. Reloads the threads if needed. */
static void
dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
                                 struct dp_netdev_port *port)
{
    struct dp_netdev_pmd_thread *pmd;
    struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
    struct hmapx_node *node;

    dp_netdev_del_port_from_all_pmds__(dp, port, &to_reload);

    HMAPX_FOR_EACH (node, &to_reload) {
        pmd = (struct dp_netdev_pmd_thread *) node->data;
        dp_netdev_reload_pmd__(pmd);
    }

    hmapx_destroy(&to_reload);
}


/* Returns non-isolated PMD thread from this numa node with fewer
 * rx queues to poll. Returns NULL if there is no non-isolated  PMD threads
 * on this numa node. Can be called safely only by main thread. */
static struct dp_netdev_pmd_thread *
dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id)
{
    int min_cnt = -1;
    struct dp_netdev_pmd_thread *pmd, *res = NULL;

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        if (!pmd->isolated && pmd->numa_id == numa_id
            && (min_cnt > pmd->poll_cnt || res == NULL)) {
            min_cnt = pmd->poll_cnt;
            res = pmd;
        }
    }

    return res;
}

/* Adds rx queue to poll_list of PMD thread. */
static void
dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
                         struct dp_netdev_port *port, struct netdev_rxq *rx)
    OVS_REQUIRES(pmd->port_mutex)
{
    struct rxq_poll *poll = xmalloc(sizeof *poll);

    poll->port = port;
    poll->rx = rx;

    ovs_list_push_back(&pmd->poll_list, &poll->node);
    pmd->poll_cnt++;
}

/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
 * changes to take effect. */
static void
dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                             struct dp_netdev_port *port)
{
    struct tx_port *tx;

    tx = xzalloc(sizeof *tx);

    tx->port = port;
    tx->qid = -1;

    ovs_mutex_lock(&pmd->port_mutex);
    hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
    ovs_mutex_unlock(&pmd->port_mutex);
}

/* Distribute all {pinned|non-pinned} rx queues of 'port' between PMD
 * threads in 'dp'. The pmd threads that need to be restarted are inserted
 * in 'to_reload'. PMD threads with pinned queues marked as isolated. */
static void
dp_netdev_add_port_rx_to_pmds(struct dp_netdev *dp,
                              struct dp_netdev_port *port,
                              struct hmapx *to_reload, bool pinned)
{
    int numa_id = netdev_get_numa_id(port->netdev);
    struct dp_netdev_pmd_thread *pmd;
    int i;

    if (!netdev_is_pmd(port->netdev)) {
        return;
    }

    for (i = 0; i < port->n_rxq; i++) {
        if (pinned) {
            if (port->rxqs[i].core_id == OVS_CORE_UNSPEC) {
                continue;
            }
            pmd = dp_netdev_get_pmd(dp, port->rxqs[i].core_id);
            if (!pmd) {
                VLOG_WARN("There is no PMD thread on core %d. "
                          "Queue %d on port \'%s\' will not be polled.",
                          port->rxqs[i].core_id, i,
                          netdev_get_name(port->netdev));
                continue;
            }
            pmd->isolated = true;
            dp_netdev_pmd_unref(pmd);
        } else {
            if (port->rxqs[i].core_id != OVS_CORE_UNSPEC) {
                continue;
            }
            pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
            if (!pmd) {
                VLOG_WARN("There's no available pmd thread on numa node %d",
                          numa_id);
                break;
            }
        }

        ovs_mutex_lock(&pmd->port_mutex);
        dp_netdev_add_rxq_to_pmd(pmd, port, port->rxqs[i].rxq);
        ovs_mutex_unlock(&pmd->port_mutex);

        hmapx_add(to_reload, pmd);
    }
}

/* Distributes all non-pinned rx queues of 'port' between all PMD threads
 * in 'dp' and inserts 'port' in the PMD threads 'tx_ports'. The pmd threads
 * that need to be restarted are inserted in 'to_reload'. */
static void
dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, struct dp_netdev_port *port,
                             struct hmapx *to_reload)
{
    struct dp_netdev_pmd_thread *pmd;

    dp_netdev_add_port_rx_to_pmds(dp, port, to_reload, false);

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        dp_netdev_add_port_tx_to_pmd(pmd, port);
        hmapx_add(to_reload, pmd);
    }
}

/* Distributes all non-pinned rx queues of 'port' between all PMD threads
 * in 'dp', inserts 'port' in the PMD threads 'tx_ports' and reloads them,
 * if needed. */
static void
dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
{
    struct dp_netdev_pmd_thread *pmd;
    struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
    struct hmapx_node *node;

    dp_netdev_add_port_to_pmds__(dp, port, &to_reload);

    HMAPX_FOR_EACH (node, &to_reload) {
        pmd = (struct dp_netdev_pmd_thread *) node->data;
        dp_netdev_reload_pmd__(pmd);
    }

    hmapx_destroy(&to_reload);
}

/* Starts pmd threads for the numa node 'numa_id', if not already started.
 * The function takes care of filling the threads tx port cache. */
static void
dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
    OVS_REQUIRES(dp->port_mutex)
{
    int n_pmds;

    if (!ovs_numa_numa_id_is_valid(numa_id)) {
        VLOG_WARN("Cannot create pmd threads due to numa id (%d) invalid",
                  numa_id);
        return;
    }

    n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);

    /* If there are already pmd threads created for the numa node
     * in which 'netdev' is on, do nothing.  Else, creates the
     * pmd threads for the numa node. */
    if (!n_pmds) {
        int can_have, n_unpinned, i;

        n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
        if (!n_unpinned) {
            VLOG_WARN("Cannot create pmd threads due to out of unpinned "
                      "cores on numa node %d", numa_id);
            return;
        }

        /* If cpu mask is specified, uses all unpinned cores, otherwise
         * tries creating NR_PMD_THREADS pmd threads. */
        can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
        for (i = 0; i < can_have; i++) {
            unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
            struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
            struct dp_netdev_port *port;

            dp_netdev_configure_pmd(pmd, dp, core_id, numa_id);

            HMAP_FOR_EACH (port, node, &dp->ports) {
                dp_netdev_add_port_tx_to_pmd(pmd, port);
            }

            pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
        }
        VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
    }
}


/* Called after pmd threads config change.  Restarts pmd threads with
 * new configuration. */
static void
dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex)
{
    struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
    struct dp_netdev_pmd_thread *pmd;
    struct dp_netdev_port *port;
    struct hmapx_node *node;

    HMAP_FOR_EACH (port, node, &dp->ports) {
        if (netdev_is_pmd(port->netdev)) {
            struct hmapx numas = HMAPX_INITIALIZER(&numas);
            struct hmapx_node *numa_node;
            uintptr_t numa_id;
            int i;

            numa_id = netdev_get_numa_id(port->netdev);
            hmapx_add(&numas, (void *) numa_id);
            for (i = 0; i < port->n_rxq; i++) {
                unsigned core_id = port->rxqs[i].core_id;

                if (core_id != OVS_CORE_UNSPEC) {
                    numa_id = ovs_numa_get_numa_id(core_id);
                    hmapx_add(&numas, (void *) numa_id);
                }
            }

            HMAPX_FOR_EACH (numa_node, &numas) {
                dp_netdev_set_pmds_on_numa(dp, (uintptr_t) numa_node->data);
            }

            hmapx_destroy(&numas);
        }
        /* Distribute only pinned rx queues first to mark threads as isolated */
        dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload, true);
    }

    /* Distribute remaining non-pinned rx queues to non-isolated PMD threads. */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload, false);
    }

    HMAPX_FOR_EACH (node, &to_reload) {
        pmd = (struct dp_netdev_pmd_thread *) node->data;
        dp_netdev_reload_pmd__(pmd);
    }

    hmapx_destroy(&to_reload);
}

static char *
dpif_netdev_get_datapath_version(void)
{
     return xstrdup("<built-in>");
}

static void
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
                    uint16_t tcp_flags, long long now)
{
    uint16_t flags;

    atomic_store_relaxed(&netdev_flow->stats.used, now);
    non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
    non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
    flags |= tcp_flags;
    atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
}

static void
dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
                       enum dp_stat_type type, int cnt)
{
    non_atomic_ullong_add(&pmd->stats.n[type], cnt);
}

static int
dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
                 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
                 enum dpif_upcall_type type, const struct nlattr *userdata,
                 struct ofpbuf *actions, struct ofpbuf *put_actions)
{
    struct dp_netdev *dp = pmd->dp;

    if (OVS_UNLIKELY(!dp->upcall_cb)) {
        return ENODEV;
    }

    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
        struct ds ds = DS_EMPTY_INITIALIZER;
        char *packet_str;
        struct ofpbuf key;
        struct odp_flow_key_parms odp_parms = {
            .flow = flow,
            .mask = wc ? &wc->masks : NULL,
            .support = dp_netdev_support,
        };

        ofpbuf_init(&key, 0);
        odp_flow_key_from_flow(&odp_parms, &key);
        packet_str = ofp_packet_to_string(dp_packet_data(packet_),
                                          dp_packet_size(packet_));

        odp_flow_key_format(key.data, key.size, &ds);

        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);

        ofpbuf_uninit(&key);
        free(packet_str);

        ds_destroy(&ds);
    }

    return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
                         actions, wc, put_actions, dp->upcall_aux);
}

static inline uint32_t
dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
                                const struct miniflow *mf)
{
    uint32_t hash, recirc_depth;

    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
        hash = dp_packet_get_rss_hash(packet);
    } else {
        hash = miniflow_hash_5tuple(mf, 0);
        dp_packet_set_rss_hash(packet, hash);
    }

    /* The RSS hash must account for the recirculation depth to avoid
     * collisions in the exact match cache */
    recirc_depth = *recirc_depth_get_unsafe();
    if (OVS_UNLIKELY(recirc_depth)) {
        hash = hash_finish(hash, recirc_depth);
        dp_packet_set_rss_hash(packet, hash);
    }
    return hash;
}

struct packet_batch_per_flow {
    unsigned int byte_count;
    uint16_t tcp_flags;
    struct dp_netdev_flow *flow;

    struct dp_packet_batch array;
};

static inline void
packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
                             struct dp_packet *packet,
                             const struct miniflow *mf)
{
    batch->byte_count += dp_packet_size(packet);
    batch->tcp_flags |= miniflow_get_tcp_flags(mf);
    batch->array.packets[batch->array.count++] = packet;
}

static inline void
packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
                           struct dp_netdev_flow *flow)
{
    flow->batch = batch;

    batch->flow = flow;
    dp_packet_batch_init(&batch->array);
    batch->byte_count = 0;
    batch->tcp_flags = 0;
}

static inline void
packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
                              struct dp_netdev_pmd_thread *pmd,
                              long long now)
{
    struct dp_netdev_actions *actions;
    struct dp_netdev_flow *flow = batch->flow;

    dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
                        batch->tcp_flags, now);

    actions = dp_netdev_flow_get_actions(flow);

    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
                              actions->actions, actions->size, now);
}

static inline void
dp_netdev_queue_batches(struct dp_packet *pkt,
                        struct dp_netdev_flow *flow, const struct miniflow *mf,
                        struct packet_batch_per_flow *batches, size_t *n_batches)
{
    struct packet_batch_per_flow *batch = flow->batch;

    if (OVS_UNLIKELY(!batch)) {
        batch = &batches[(*n_batches)++];
        packet_batch_per_flow_init(batch, flow);
    }

    packet_batch_per_flow_update(batch, pkt, mf);
}

/* Try to process all ('cnt') the 'packets' using only the exact match cache
 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
 * miniflow is copied into 'keys' and the packet pointer is moved at the
 * beginning of the 'packets' array.
 *
 * The function returns the number of packets that needs to be processed in the
 * 'packets' array (they have been moved to the beginning of the vector).
 *
 * If 'md_is_valid' is false, the metadata in 'packets' is not valid and must be
 * initialized by this function using 'port_no'.
 */
static inline size_t
emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets_,
               struct netdev_flow_key *keys,
               struct packet_batch_per_flow batches[], size_t *n_batches,
               bool md_is_valid, odp_port_t port_no)
{
    struct emc_cache *flow_cache = &pmd->flow_cache;
    struct netdev_flow_key *key = &keys[0];
    size_t i, n_missed = 0, n_dropped = 0;
    struct dp_packet **packets = packets_->packets;
    int cnt = packets_->count;

    for (i = 0; i < cnt; i++) {
        struct dp_netdev_flow *flow;
        struct dp_packet *packet = packets[i];

        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
            dp_packet_delete(packet);
            n_dropped++;
            continue;
        }

        if (i != cnt - 1) {
            /* Prefetch next packet data and metadata. */
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
            pkt_metadata_prefetch_init(&packets[i+1]->md);
        }

        if (!md_is_valid) {
            pkt_metadata_init(&packet->md, port_no);
        }
        miniflow_extract(packet, &key->mf);
        key->len = 0; /* Not computed yet. */
        key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);

        flow = emc_lookup(flow_cache, key);
        if (OVS_LIKELY(flow)) {
            dp_netdev_queue_batches(packet, flow, &key->mf, batches,
                                    n_batches);
        } else {
            /* Exact match cache missed. Group missed packets together at
             * the beginning of the 'packets' array.  */
            packets[n_missed] = packet;
            /* 'key[n_missed]' contains the key of the current packet and it
             * must be returned to the caller. The next key should be extracted
             * to 'keys[n_missed + 1]'. */
            key = &keys[++n_missed];
        }
    }

    dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - n_dropped - n_missed);

    return n_missed;
}

static inline void
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet,
                     const struct netdev_flow_key *key,
                     struct ofpbuf *actions, struct ofpbuf *put_actions,
                     int *lost_cnt, long long now)
{
    struct ofpbuf *add_actions;
    struct dp_packet_batch b;
    struct match match;
    ovs_u128 ufid;
    int error;

    match.tun_md.valid = false;
    miniflow_expand(&key->mf, &match.flow);

    ofpbuf_clear(actions);
    ofpbuf_clear(put_actions);

    dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
                             &ufid, DPIF_UC_MISS, NULL, actions,
                             put_actions);
    if (OVS_UNLIKELY(error && error != ENOSPC)) {
        dp_packet_delete(packet);
        (*lost_cnt)++;
        return;
    }

    /* The Netlink encoding of datapath flow keys cannot express
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
     * tag is interpreted as exact match on the fact that there is no
     * VLAN.  Unless we refactor a lot of code that translates between
     * Netlink and struct flow representations, we have to do the same
     * here. */
    if (!match.wc.masks.vlan_tci) {
        match.wc.masks.vlan_tci = htons(0xffff);
    }

    /* We can't allow the packet batching in the next loop to execute
     * the actions.  Otherwise, if there are any slow path actions,
     * we'll send the packet up twice. */
    packet_batch_init_packet(&b, packet);
    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
                              actions->data, actions->size, now);

    add_actions = put_actions->size ? put_actions : actions;
    if (OVS_LIKELY(error != ENOSPC)) {
        struct dp_netdev_flow *netdev_flow;

        /* XXX: There's a race window where a flow covering this packet
         * could have already been installed since we last did the flow
         * lookup before upcall.  This could be solved by moving the
         * mutex lock outside the loop, but that's an awful long time
         * to be locking everyone out of making flow installs.  If we
         * move to a per-core classifier, it would be reasonable. */
        ovs_mutex_lock(&pmd->flow_mutex);
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
        if (OVS_LIKELY(!netdev_flow)) {
            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
                                             add_actions->data,
                                             add_actions->size);
        }
        ovs_mutex_unlock(&pmd->flow_mutex);

        emc_insert(&pmd->flow_cache, key, netdev_flow);
    }
}

static inline void
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
                     struct dp_packet_batch *packets_,
                     struct netdev_flow_key *keys,
                     struct packet_batch_per_flow batches[], size_t *n_batches,
                     odp_port_t in_port,
                     long long now)
{
    int cnt = packets_->count;
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct dp_packet **packets = packets_->packets;
    struct dpcls *cls;
    struct dpcls_rule *rules[PKT_ARRAY_SIZE];
    struct dp_netdev *dp = pmd->dp;
    struct emc_cache *flow_cache = &pmd->flow_cache;
    int miss_cnt = 0, lost_cnt = 0;
    int lookup_cnt = 0, add_lookup_cnt;
    bool any_miss;
    size_t i;

    for (i = 0; i < cnt; i++) {
        /* Key length is needed in all the cases, hash computed on demand. */
        keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
    }
    /* Get the classifier for the in_port */
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
    if (OVS_LIKELY(cls)) {
        any_miss = !dpcls_lookup(cls, keys, rules, cnt, &lookup_cnt);
    } else {
        any_miss = true;
        memset(rules, 0, sizeof(rules));
    }
    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
        struct ofpbuf actions, put_actions;

        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);

        for (i = 0; i < cnt; i++) {
            struct dp_netdev_flow *netdev_flow;

            if (OVS_LIKELY(rules[i])) {
                continue;
            }

            /* It's possible that an earlier slow path execution installed
             * a rule covering this flow.  In this case, it's a lot cheaper
             * to catch it here than execute a miss. */
            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i],
                                                    &add_lookup_cnt);
            if (netdev_flow) {
                lookup_cnt += add_lookup_cnt;
                rules[i] = &netdev_flow->cr;
                continue;
            }

            miss_cnt++;
            handle_packet_upcall(pmd, packets[i], &keys[i], &actions,
                                 &put_actions, &lost_cnt, now);
        }

        ofpbuf_uninit(&actions);
        ofpbuf_uninit(&put_actions);
        fat_rwlock_unlock(&dp->upcall_rwlock);
        dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
    } else if (OVS_UNLIKELY(any_miss)) {
        for (i = 0; i < cnt; i++) {
            if (OVS_UNLIKELY(!rules[i])) {
                dp_packet_delete(packets[i]);
                lost_cnt++;
                miss_cnt++;
            }
        }
    }

    for (i = 0; i < cnt; i++) {
        struct dp_packet *packet = packets[i];
        struct dp_netdev_flow *flow;

        if (OVS_UNLIKELY(!rules[i])) {
            continue;
        }

        flow = dp_netdev_flow_cast(rules[i]);

        emc_insert(flow_cache, &keys[i], flow);
        dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
    }

    dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
    dp_netdev_count_packet(pmd, DP_STAT_LOOKUP_HIT, lookup_cnt);
    dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
    dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
}

/* Packets enter the datapath from a port (or from recirculation) here.
 *
 * For performance reasons a caller may choose not to initialize the metadata
 * in 'packets': in this case 'mdinit' is false and this function needs to
 * initialize it using 'port_no'.  If the metadata in 'packets' is already
 * valid, 'md_is_valid' must be true and 'port_no' will be ignored. */
static void
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                  struct dp_packet_batch *packets,
                  bool md_is_valid, odp_port_t port_no)
{
    int cnt = packets->count;
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct netdev_flow_key keys[PKT_ARRAY_SIZE];
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
    long long now = time_msec();
    size_t newcnt, n_batches, i;
    odp_port_t in_port;

    n_batches = 0;
    newcnt = emc_processing(pmd, packets, keys, batches, &n_batches,
                            md_is_valid, port_no);
    if (OVS_UNLIKELY(newcnt)) {
        packets->count = newcnt;
        /* Get ingress port from first packet's metadata. */
        in_port = packets->packets[0]->md.in_port.odp_port;
        fast_path_processing(pmd, packets, keys, batches, &n_batches, in_port, now);
    }

    /* All the flow batches need to be reset before any call to
     * packet_batch_per_flow_execute() as it could potentially trigger
     * recirculation. When a packet matching flow ‘j’ happens to be
     * recirculated, the nested call to dp_netdev_input__() could potentially
     * classify the packet as matching another flow - say 'k'. It could happen
     * that in the previous call to dp_netdev_input__() that same flow 'k' had
     * already its own batches[k] still waiting to be served.  So if its
     * ‘batch’ member is not reset, the recirculated packet would be wrongly
     * appended to batches[k] of the 1st call to dp_netdev_input__(). */
    for (i = 0; i < n_batches; i++) {
        batches[i].flow->batch = NULL;
    }

    for (i = 0; i < n_batches; i++) {
        packet_batch_per_flow_execute(&batches[i], pmd, now);
    }
}

static void
dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
                struct dp_packet_batch *packets,
                odp_port_t port_no)
{
    dp_netdev_input__(pmd, packets, false, port_no);
}

static void
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet_batch *packets)
{
    dp_netdev_input__(pmd, packets, true, 0);
}

struct dp_netdev_execute_aux {
    struct dp_netdev_pmd_thread *pmd;
    long long now;
    const struct flow *flow;
};

static void
dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
                                 void *aux)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    dp->dp_purge_aux = aux;
    dp->dp_purge_cb = cb;
}

static void
dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
                               void *aux)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    dp->upcall_aux = aux;
    dp->upcall_cb = cb;
}

static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                               long long now, bool purge)
{
    struct tx_port *tx;
    struct dp_netdev_port *port;
    long long interval;

    HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
        if (!tx->port->dynamic_txqs) {
            continue;
        }
        interval = now - tx->last_used;
        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
            port = tx->port;
            ovs_mutex_lock(&port->txq_used_mutex);
            port->txq_used[tx->qid]--;
            ovs_mutex_unlock(&port->txq_used_mutex);
            tx->qid = -1;
        }
    }
}

static int
dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
                           struct tx_port *tx, long long now)
{
    struct dp_netdev_port *port;
    long long interval;
    int i, min_cnt, min_qid;

    if (OVS_UNLIKELY(!now)) {
        now = time_msec();
    }

    interval = now - tx->last_used;
    tx->last_used = now;

    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
        return tx->qid;
    }

    port = tx->port;

    ovs_mutex_lock(&port->txq_used_mutex);
    if (tx->qid >= 0) {
        port->txq_used[tx->qid]--;
        tx->qid = -1;
    }

    min_cnt = -1;
    min_qid = 0;
    for (i = 0; i < netdev_n_txq(port->netdev); i++) {
        if (port->txq_used[i] < min_cnt || min_cnt == -1) {
            min_cnt = port->txq_used[i];
            min_qid = i;
        }
    }

    port->txq_used[min_qid]++;
    tx->qid = min_qid;

    ovs_mutex_unlock(&port->txq_used_mutex);

    dpif_netdev_xps_revalidate_pmd(pmd, now, false);

    VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
             pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
    return min_qid;
}

static struct tx_port *
pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
                          odp_port_t port_no)
{
    return tx_port_lookup(&pmd->tnl_port_cache, port_no);
}

static struct tx_port *
pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
                           odp_port_t port_no)
{
    return tx_port_lookup(&pmd->send_port_cache, port_no);
}

static int
push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
                const struct nlattr *attr,
                struct dp_packet_batch *batch)
{
    struct tx_port *tun_port;
    const struct ovs_action_push_tnl *data;
    int err;

    data = nl_attr_get(attr);

    tun_port = pmd_tnl_port_cache_lookup(pmd, u32_to_odp(data->tnl_port));
    if (!tun_port) {
        err = -EINVAL;
        goto error;
    }
    err = netdev_push_header(tun_port->port->netdev, batch, data);
    if (!err) {
        return 0;
    }
error:
    dp_packet_delete_batch(batch, true);
    return err;
}

static void
dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
                            struct dp_packet *packet, bool may_steal,
                            struct flow *flow, ovs_u128 *ufid,
                            struct ofpbuf *actions,
                            const struct nlattr *userdata, long long now)
{
    struct dp_packet_batch b;
    int error;

    ofpbuf_clear(actions);

    error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
                             DPIF_UC_ACTION, userdata, actions,
                             NULL);
    if (!error || error == ENOSPC) {
        packet_batch_init_packet(&b, packet);
        dp_netdev_execute_actions(pmd, &b, may_steal, flow,
                                  actions->data, actions->size, now);
    } else if (may_steal) {
        dp_packet_delete(packet);
    }
}

static void
dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
              const struct nlattr *a, bool may_steal)
{
    struct dp_netdev_execute_aux *aux = aux_;
    uint32_t *depth = recirc_depth_get();
    struct dp_netdev_pmd_thread *pmd = aux->pmd;
    struct dp_netdev *dp = pmd->dp;
    int type = nl_attr_type(a);
    long long now = aux->now;
    struct tx_port *p;

    switch ((enum ovs_action_attr)type) {
    case OVS_ACTION_ATTR_OUTPUT:
        p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
        if (OVS_LIKELY(p)) {
            int tx_qid;
            bool dynamic_txqs;

            dynamic_txqs = p->port->dynamic_txqs;
            if (dynamic_txqs) {
                tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
            } else {
                atomic_read_relaxed(&pmd->static_tx_qid, &tx_qid);
            }

            netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
                        dynamic_txqs);
            return;
        }
        break;

    case OVS_ACTION_ATTR_TUNNEL_PUSH:
        if (*depth < MAX_RECIRC_DEPTH) {
            struct dp_packet_batch tnl_pkt;
            struct dp_packet_batch *orig_packets_ = packets_;
            int err;

            if (!may_steal) {
                dp_packet_batch_clone(&tnl_pkt, packets_);
                packets_ = &tnl_pkt;
                dp_packet_batch_reset_cutlen(orig_packets_);
            }

            dp_packet_batch_apply_cutlen(packets_);

            err = push_tnl_action(pmd, a, packets_);
            if (!err) {
                (*depth)++;
                dp_netdev_recirculate(pmd, packets_);
                (*depth)--;
            }
            return;
        }
        break;

    case OVS_ACTION_ATTR_TUNNEL_POP:
        if (*depth < MAX_RECIRC_DEPTH) {
            struct dp_packet_batch *orig_packets_ = packets_;
            odp_port_t portno = nl_attr_get_odp_port(a);

            p = pmd_tnl_port_cache_lookup(pmd, portno);
            if (p) {
                struct dp_packet_batch tnl_pkt;
                int i;

                if (!may_steal) {
                    dp_packet_batch_clone(&tnl_pkt, packets_);
                    packets_ = &tnl_pkt;
                    dp_packet_batch_reset_cutlen(orig_packets_);
                }

                dp_packet_batch_apply_cutlen(packets_);

                netdev_pop_header(p->port->netdev, packets_);
                if (!packets_->count) {
                    return;
                }

                for (i = 0; i < packets_->count; i++) {
                    packets_->packets[i]->md.in_port.odp_port = portno;
                }

                (*depth)++;
                dp_netdev_recirculate(pmd, packets_);
                (*depth)--;
                return;
            }
        }
        break;

    case OVS_ACTION_ATTR_USERSPACE:
        if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
            struct dp_packet_batch *orig_packets_ = packets_;
            struct dp_packet **packets = packets_->packets;
            const struct nlattr *userdata;
            struct dp_packet_batch usr_pkt;
            struct ofpbuf actions;
            struct flow flow;
            ovs_u128 ufid;
            bool clone = false;
            int i;

            userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
            ofpbuf_init(&actions, 0);

            if (packets_->trunc) {
                if (!may_steal) {
                    dp_packet_batch_clone(&usr_pkt, packets_);
                    packets_ = &usr_pkt;
                    packets = packets_->packets;
                    clone = true;
                    dp_packet_batch_reset_cutlen(orig_packets_);
                }

                dp_packet_batch_apply_cutlen(packets_);
            }

            for (i = 0; i < packets_->count; i++) {
                flow_extract(packets[i], &flow);
                dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
                dp_execute_userspace_action(pmd, packets[i], may_steal, &flow,
                                            &ufid, &actions, userdata, now);
            }

            if (clone) {
                dp_packet_delete_batch(packets_, true);
            }

            ofpbuf_uninit(&actions);
            fat_rwlock_unlock(&dp->upcall_rwlock);

            return;
        }
        break;

    case OVS_ACTION_ATTR_RECIRC:
        if (*depth < MAX_RECIRC_DEPTH) {
            struct dp_packet_batch recirc_pkts;
            int i;

            if (!may_steal) {
               dp_packet_batch_clone(&recirc_pkts, packets_);
               packets_ = &recirc_pkts;
            }

            for (i = 0; i < packets_->count; i++) {
                packets_->packets[i]->md.recirc_id = nl_attr_get_u32(a);
            }

            (*depth)++;
            dp_netdev_recirculate(pmd, packets_);
            (*depth)--;

            return;
        }

        VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
        break;

    case OVS_ACTION_ATTR_CT: {
        const struct nlattr *b;
        bool commit = false;
        unsigned int left;
        uint16_t zone = 0;
        const char *helper = NULL;
        const uint32_t *setmark = NULL;
        const struct ovs_key_ct_labels *setlabel = NULL;

        NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
                                 nl_attr_get_size(a)) {
            enum ovs_ct_attr sub_type = nl_attr_type(b);

            switch(sub_type) {
            case OVS_CT_ATTR_COMMIT:
                commit = true;
                break;
            case OVS_CT_ATTR_ZONE:
                zone = nl_attr_get_u16(b);
                break;
            case OVS_CT_ATTR_HELPER:
                helper = nl_attr_get_string(b);
                break;
            case OVS_CT_ATTR_MARK:
                setmark = nl_attr_get(b);
                break;
            case OVS_CT_ATTR_LABELS:
                setlabel = nl_attr_get(b);
                break;
            case OVS_CT_ATTR_NAT:
            case OVS_CT_ATTR_UNSPEC:
            case __OVS_CT_ATTR_MAX:
                OVS_NOT_REACHED();
            }
        }

        conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, commit,
                          zone, setmark, setlabel, helper);
        break;
    }

    case OVS_ACTION_ATTR_PUSH_VLAN:
    case OVS_ACTION_ATTR_POP_VLAN:
    case OVS_ACTION_ATTR_PUSH_MPLS:
    case OVS_ACTION_ATTR_POP_MPLS:
    case OVS_ACTION_ATTR_SET:
    case OVS_ACTION_ATTR_SET_MASKED:
    case OVS_ACTION_ATTR_SAMPLE:
    case OVS_ACTION_ATTR_HASH:
    case OVS_ACTION_ATTR_UNSPEC:
    case OVS_ACTION_ATTR_TRUNC:
    case __OVS_ACTION_ATTR_MAX:
        OVS_NOT_REACHED();
    }

    dp_packet_delete_batch(packets_, may_steal);
}

static void
dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                          struct dp_packet_batch *packets,
                          bool may_steal, const struct flow *flow,
                          const struct nlattr *actions, size_t actions_len,
                          long long now)
{
    struct dp_netdev_execute_aux aux = { pmd, now, flow };

    odp_execute_actions(&aux, packets, may_steal, actions,
                        actions_len, dp_execute_cb);
}

struct dp_netdev_ct_dump {
    struct ct_dpif_dump_state up;
    struct conntrack_dump dump;
    struct conntrack *ct;
    struct dp_netdev *dp;
};

static int
dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
                          const uint16_t *pzone)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_ct_dump *dump;

    dump = xzalloc(sizeof *dump);
    dump->dp = dp;
    dump->ct = &dp->conntrack;

    conntrack_dump_start(&dp->conntrack, &dump->dump, pzone);

    *dump_ = &dump->up;

    return 0;
}

static int
dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
                         struct ct_dpif_dump_state *dump_,
                         struct ct_dpif_entry *entry)
{
    struct dp_netdev_ct_dump *dump;

    INIT_CONTAINER(dump, dump_, up);

    return conntrack_dump_next(&dump->dump, entry);
}

static int
dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
                         struct ct_dpif_dump_state *dump_)
{
    struct dp_netdev_ct_dump *dump;
    int err;

    INIT_CONTAINER(dump, dump_, up);

    err = conntrack_dump_done(&dump->dump);

    free(dump);

    return err;
}

static int
dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    return conntrack_flush(&dp->conntrack, zone);
}

const struct dpif_class dpif_netdev_class = {
    "netdev",
    dpif_netdev_init,
    dpif_netdev_enumerate,
    dpif_netdev_port_open_type,
    dpif_netdev_open,
    dpif_netdev_close,
    dpif_netdev_destroy,
    dpif_netdev_run,
    dpif_netdev_wait,
    dpif_netdev_get_stats,
    dpif_netdev_port_add,
    dpif_netdev_port_del,
    dpif_netdev_port_set_config,
    dpif_netdev_port_query_by_number,
    dpif_netdev_port_query_by_name,
    NULL,                       /* port_get_pid */
    dpif_netdev_port_dump_start,
    dpif_netdev_port_dump_next,
    dpif_netdev_port_dump_done,
    dpif_netdev_port_poll,
    dpif_netdev_port_poll_wait,
    dpif_netdev_flow_flush,
    dpif_netdev_flow_dump_create,
    dpif_netdev_flow_dump_destroy,
    dpif_netdev_flow_dump_thread_create,
    dpif_netdev_flow_dump_thread_destroy,
    dpif_netdev_flow_dump_next,
    dpif_netdev_operate,
    NULL,                       /* recv_set */
    NULL,                       /* handlers_set */
    dpif_netdev_pmd_set,
    dpif_netdev_queue_to_priority,
    NULL,                       /* recv */
    NULL,                       /* recv_wait */
    NULL,                       /* recv_purge */
    dpif_netdev_register_dp_purge_cb,
    dpif_netdev_register_upcall_cb,
    dpif_netdev_enable_upcall,
    dpif_netdev_disable_upcall,
    dpif_netdev_get_datapath_version,
    dpif_netdev_ct_dump_start,
    dpif_netdev_ct_dump_next,
    dpif_netdev_ct_dump_done,
    dpif_netdev_ct_flush,
};

static void
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
                              const char *argv[], void *aux OVS_UNUSED)
{
    struct dp_netdev_port *port;
    struct dp_netdev *dp;
    odp_port_t port_no;

    ovs_mutex_lock(&dp_netdev_mutex);
    dp = shash_find_data(&dp_netdevs, argv[1]);
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
        ovs_mutex_unlock(&dp_netdev_mutex);
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
        return;
    }
    ovs_refcount_ref(&dp->ref_cnt);
    ovs_mutex_unlock(&dp_netdev_mutex);

    ovs_mutex_lock(&dp->port_mutex);
    if (get_port_by_name(dp, argv[2], &port)) {
        unixctl_command_reply_error(conn, "unknown port");
        goto exit;
    }

    port_no = u32_to_odp(atoi(argv[3]));
    if (!port_no || port_no == ODPP_NONE) {
        unixctl_command_reply_error(conn, "bad port number");
        goto exit;
    }
    if (dp_netdev_lookup_port(dp, port_no)) {
        unixctl_command_reply_error(conn, "port number already in use");
        goto exit;
    }

    /* Remove port. */
    hmap_remove(&dp->ports, &port->node);
    dp_netdev_del_port_from_all_pmds(dp, port);

    /* Reinsert with new port number. */
    port->port_no = port_no;
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
    dp_netdev_add_port_to_pmds(dp, port);

    seq_change(dp->port_seq);
    unixctl_command_reply(conn, NULL);

exit:
    ovs_mutex_unlock(&dp->port_mutex);
    dp_netdev_unref(dp);
}

static void
dpif_dummy_register__(const char *type)
{
    struct dpif_class *class;

    class = xmalloc(sizeof *class);
    *class = dpif_netdev_class;
    class->type = xstrdup(type);
    dp_register_provider(class);
}

static void
dpif_dummy_override(const char *type)
{
    int error;

    /*
     * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
     * a userland-only build.  It's useful for testsuite.
     */
    error = dp_unregister_provider(type);
    if (error == 0 || error == EAFNOSUPPORT) {
        dpif_dummy_register__(type);
    }
}

void
dpif_dummy_register(enum dummy_level level)
{
    if (level == DUMMY_OVERRIDE_ALL) {
        struct sset types;
        const char *type;

        sset_init(&types);
        dp_enumerate_types(&types);
        SSET_FOR_EACH (type, &types) {
            dpif_dummy_override(type);
        }
        sset_destroy(&types);
    } else if (level == DUMMY_OVERRIDE_SYSTEM) {
        dpif_dummy_override("system");
    }

    dpif_dummy_register__("dummy");

    unixctl_command_register("dpif-dummy/change-port-number",
                             "dp port new-number",
                             3, 3, dpif_dummy_change_port_number, NULL);
}

/* Datapath Classifier. */

/* A set of rules that all have the same fields wildcarded. */
struct dpcls_subtable {
    /* The fields are only used by writers. */
    struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */

    /* These fields are accessed by readers. */
    struct cmap rules;           /* Contains "struct dpcls_rule"s. */
    uint32_t hit_cnt;            /* Number of match hits in subtable in current
                                    optimization interval. */
    struct netdev_flow_key mask; /* Wildcards for fields (const). */
    /* 'mask' must be the last field, additional space is allocated here. */
};

/* Initializes 'cls' as a classifier that initially contains no classification
 * rules. */
static void
dpcls_init(struct dpcls *cls)
{
    cmap_init(&cls->subtables_map);
    pvector_init(&cls->subtables);
}

static void
dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
{
    VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
    pvector_remove(&cls->subtables, subtable);
    cmap_remove(&cls->subtables_map, &subtable->cmap_node,
                subtable->mask.hash);
    cmap_destroy(&subtable->rules);
    ovsrcu_postpone(free, subtable);
}

/* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
 * caller's responsibility.
 * May only be called after all the readers have been terminated. */
static void
dpcls_destroy(struct dpcls *cls)
{
    if (cls) {
        struct dpcls_subtable *subtable;

        CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
            ovs_assert(cmap_count(&subtable->rules) == 0);
            dpcls_destroy_subtable(cls, subtable);
        }
        cmap_destroy(&cls->subtables_map);
        pvector_destroy(&cls->subtables);
    }
}

static struct dpcls_subtable *
dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
{
    struct dpcls_subtable *subtable;

    /* Need to add one. */
    subtable = xmalloc(sizeof *subtable
                       - sizeof subtable->mask.mf + mask->len);
    cmap_init(&subtable->rules);
    subtable->hit_cnt = 0;
    netdev_flow_key_clone(&subtable->mask, mask);
    cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
    /* Add the new subtable at the end of the pvector (with no hits yet) */
    pvector_insert(&cls->subtables, subtable, 0);
    VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
             cmap_count(&cls->subtables_map), subtable, cls->in_port);
    pvector_publish(&cls->subtables);

    return subtable;
}

static inline struct dpcls_subtable *
dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
{
    struct dpcls_subtable *subtable;

    CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
                             &cls->subtables_map) {
        if (netdev_flow_key_equal(&subtable->mask, mask)) {
            return subtable;
        }
    }
    return dpcls_create_subtable(cls, mask);
}


/* Periodically sort the dpcls subtable vectors according to hit counts */
static void
dpcls_sort_subtable_vector(struct dpcls *cls)
{
    struct pvector *pvec = &cls->subtables;
    struct dpcls_subtable *subtable;

    PVECTOR_FOR_EACH (subtable, pvec) {
        pvector_change_priority(pvec, subtable, subtable->hit_cnt);
        subtable->hit_cnt = 0;
    }
    pvector_publish(pvec);
}

static inline void
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd)
{
    struct dpcls *cls;
    long long int now = time_msec();

    if (now > pmd->next_optimization) {
        /* Try to obtain the flow lock to block out revalidator threads.
         * If not possible, just try next time. */
        if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
            /* Optimize each classifier */
            CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
                dpcls_sort_subtable_vector(cls);
            }
            ovs_mutex_unlock(&pmd->flow_mutex);
            /* Start new measuring interval */
            pmd->next_optimization = now + DPCLS_OPTIMIZATION_INTERVAL;
        }
    }
}

/* Insert 'rule' into 'cls'. */
static void
dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
             const struct netdev_flow_key *mask)
{
    struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);

    /* Refer to subtable's mask, also for later removal. */
    rule->mask = &subtable->mask;
    cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
}

/* Removes 'rule' from 'cls', also destructing the 'rule'. */
static void
dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
{
    struct dpcls_subtable *subtable;

    ovs_assert(rule->mask);

    /* Get subtable from reference in rule->mask. */
    INIT_CONTAINER(subtable, rule->mask, mask);
    if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
        == 0) {
        /* Delete empty subtable. */
        dpcls_destroy_subtable(cls, subtable);
        pvector_publish(&cls->subtables);
    }
}

/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
 * in 'mask' the values in 'key' and 'target' are the same. */
static inline bool
dpcls_rule_matches_key(const struct dpcls_rule *rule,
                       const struct netdev_flow_key *target)
{
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
    uint64_t value;

    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
            return false;
        }
    }
    return true;
}

/* For each miniflow in 'keys' performs a classifier lookup writing the result
 * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
 * NULL it is skipped.
 *
 * This function is optimized for use in the userspace datapath and therefore
 * does not implement a lot of features available in the standard
 * classifier_lookup() function.  Specifically, it does not implement
 * priorities, instead returning any rule which matches the flow.
 *
 * Returns true if all miniflows found a corresponding rule. */
static bool
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
             struct dpcls_rule **rules, const size_t cnt,
             int *num_lookups_p)
{
    /* The received 'cnt' miniflows are the search-keys that will be processed
     * to find a matching entry into the available subtables.
     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
    typedef uint32_t map_type;
#define MAP_BITS (sizeof(map_type) * CHAR_BIT)
    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);

    struct dpcls_subtable *subtable;

    map_type keys_map = TYPE_MAXIMUM(map_type); /* Set all bits. */
    map_type found_map;
    uint32_t hashes[MAP_BITS];
    const struct cmap_node *nodes[MAP_BITS];

    if (cnt != MAP_BITS) {
        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
    }
    memset(rules, 0, cnt * sizeof *rules);

    int lookups_match = 0, subtable_pos = 1;

    /* The Datapath classifier - aka dpcls - is composed of subtables.
     * Subtables are dynamically created as needed when new rules are inserted.
     * Each subtable collects rules with matches on a specific subset of packet
     * fields as defined by the subtable's mask.  We proceed to process every
     * search-key against each subtable, but when a match is found for a
     * search-key, the search for that key can stop because the rules are
     * non-overlapping. */
    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
        int i;

        /* Compute hashes for the remaining keys.  Each search-key is
         * masked with the subtable's mask to avoid hashing the wildcarded
         * bits. */
        ULLONG_FOR_EACH_1(i, keys_map) {
            hashes[i] = netdev_flow_key_hash_in_mask(&keys[i],
                                                     &subtable->mask);
        }
        /* Lookup. */
        found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
        /* Check results.  When the i-th bit of found_map is set, it means
         * that a set of nodes with a matching hash value was found for the
         * i-th search-key.  Due to possible hash collisions we need to check
         * which of the found rules, if any, really matches our masked
         * search-key. */
        ULLONG_FOR_EACH_1(i, found_map) {
            struct dpcls_rule *rule;

            CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
                if (OVS_LIKELY(dpcls_rule_matches_key(rule, &keys[i]))) {
                    rules[i] = rule;
                    /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
                     * within one second optimization interval. */
                    subtable->hit_cnt++;
                    lookups_match += subtable_pos;
                    goto next;
                }
            }
            /* None of the found rules was a match.  Reset the i-th bit to
             * keep searching this key in the next subtable. */
            ULLONG_SET0(found_map, i);  /* Did not match. */
        next:
            ;                     /* Keep Sparse happy. */
        }
        keys_map &= ~found_map;             /* Clear the found rules. */
        if (!keys_map) {
            if (num_lookups_p) {
                *num_lookups_p = lookups_match;
            }
            return true;              /* All found. */
        }
        subtable_pos++;
    }
    if (num_lookups_p) {
        *num_lookups_p = lookups_match;
    }
    return false;                     /* Some misses. */
}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								/*
-												dpif-netdev: Avoid copying netdev_flow_key in emc_processing().

Before this commit, emc_processing() copied a netdev_flow_key if there was
no exact-match cache (EMC) hit.  This commit eliminates the copy by
constructing the netdev_flow_key in the place it would be copied.

Found by inspection.

Shahbaz (CCed) reports that this reduces the cost of an EMC miss by 72
cycles in his test case in which the EMC is disabled.  Presumably this
is similarly valuable in cases where the EMC merely has few hits.

For the original version of this patch, which was against a slightly
earlier version of OVS, Daniele reported that:

    - With EMC disabled, this increases throughput from 4.8 Mpps to 5.4
      Mpps.

    - With EMC enabled, this decreases throughput from 12.4 to 12.0 Mpps.

CC: Muhammad Shahbaz <mshahbaz@cs.princeton.edu>
Signed-off-by: Ben Pfaff <blp@ovn.org>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-24 08:32:36 -08:00
+								 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								 *
 								 * Licensed under the Apache License, Version 2.0 (the "License");
 								 * you may not use this file except in compliance with the License.
 								 * You may obtain a copy of the License at:
 								 *
 								 *     http://www.apache.org/licenses/LICENSE-2.0
 								 *
 								 * Unless required by applicable law or agreed to in writing, software
 								 * distributed under the License is distributed on an "AS IS" BASIS,
 								 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								 * See the License for the specific language governing permissions and
 								 * limitations under the License.
 								 */
 								#include <config.h>
-												netdev-dpdk: Fix race condition with DPDK mempools in non pmd threads

DPDK mempools rely on rte_lcore_id() to implement a thread-local cache.
Our non pmd threads had rte_lcore_id() == 0. This allowed concurrent access to
the "thread-local" cache, causing crashes.

This commit resolves the issue with the following changes:

- Every non pmd thread has the same lcore_id (0, for management reasons), which
  is not shared with any pmd thread (lcore_id for pmd threads now start from 1)
- DPDK mbufs must be allocated/freed in pmd threads. When there is the need to
  use mempools in non pmd threads, like in dpdk_do_tx_copy(), a mutex must be
  held.
- The previous change does not allow us anymore to pass DPDK mbufs to handler
  threads: therefore this commit partially revert 143859ec63d45e. Now packets
  are copied for upcall processing. We can remove the extra memcpy by
  processing upcalls in the pmd thread itself.

With the introduction of the extra locking, the packet throughput will be lower
in the following cases:

- When using internal (tap) devices with DPDK devices on the same datapath.
  Anyway, to support internal devices efficiently, we needed DPDK KNI devices,
  which will be proper pmd devices and will not need this locking.
- When packets are processed in the slow path by non pmd threads. This overhead
  can be avoided by handling the upcalls directly in pmd threads (a change that
  has already been proposed by Ryan Wilson)

Also, the following two fixes have been introduced:
- In dpdk_free_buf() use rte_pktmbuf_free_seg() instead of rte_mempool_put().
  This allows OVS to run properly with CONFIG_RTE_LIBRTE_MBUF_DEBUG DPDK option
- Do not bulk free mbufs in a transmission queue. They may belong to different
  mempools

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-07-17 14:29:36 -07:00
+								#include "dpif-netdev.h"
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
 								#include <ctype.h>
 								#include <errno.h>
 								#include <fcntl.h>
 								#include <inttypes.h>
-												Work around bugs in system headers.

On some system, at least, one must include <sys/types.h> before
<netinet/in.h>, and <netinet/in.h> before <arpa/inet.h> or <net/if.h>.

From Jean Tourrilhes <jt@hpl.hp.com>.

											
										
										
											2010-02-12 12:51:36 -08:00
+								#include <net/if.h>
-												dpif-netdev: Fix double inclusion of cmap.h

Also, all headers sorted lexicographically.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-02-25 10:51:01 +03:00
+								#include <netinet/in.h>
-												datapath: Replace "struct odp_action" by Netlink attributes.

In the medium term, we plan to migrate the datapath to use Netlink as its
communication channel.  In the short term, we need to be able to have
actions with 64-bit arguments but "struct odp_action" only has room for
48 bits.  So this patch shifts to variable-length arguments using Netlink
attributes, which starts in on the Netlink transition and makes 64-bit
arguments possible at the same time.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 10:40:58 -08:00
+								#include <stdint.h>
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include <stdlib.h>
 								#include <string.h>
 								#include <sys/ioctl.h>
-												dpif-netdev: Fix double inclusion of cmap.h

Also, all headers sorted lexicographically.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-02-25 10:51:01 +03:00
+								#include <sys/socket.h>
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include <sys/stat.h>
 								#include <unistd.h>
-												dpdk: New module with some code from netdev-dpdk.

There's a lot of code in netdev-dpdk which is not at all related to the
netdev interface, mostly the library initialization code.

This commit moves it to a new 'dpdk' module, to simplify 'netdev-dpdk'.

Also a new module 'dpdk-stub' is introduced to implement some functions
when DPDK is not available.  This replaces the old 'netdev-nodpdk'
module.

Some redundant includes are removed or reorganized as a consequence.

No functional change.

CC: Aaron Conole <aconole@redhat.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Aaron Conole <aconole@redhat.com>
Tested-by: Aaron Conole <aconole@redhat.com>

											
										
										
											2016-10-04 17:58:05 -07:00
+								#ifdef DPDK_NETDEV
 								#include <rte_cycles.h>
 								#endif
-												dpif-netdev: Don't use metaflow to operate on userspace datapath fields.

If ofproto-dpif installs a flow into the userspace datapath that doesn't
include a mask, we need to synthesize an exact match one. This is currently
done using the metaflow infrastructure, iterating over each field and
setting it to all ones.

There is a conceptual mismatch here because metaflow is operating on
OpenFlow fields, not datapath ones. Even though they are generally very
similar, there are subtle differences, which is why it is necessary to
fix up the input port mask.

With Geneve options, the mapping is much more complicated and so the
situation is worse. The first issue is that the metaflow to flow
mapping can change over time, so we would need to do more revalidation
to track this. In addition, an upcoming patch will completely disconnect
the option format between ofproto-dpif and dpif-netdev, so the values
written by metaflow don't make sense at all.

When megaflows are turned off, ofproto-dpif internally generates masks
using flow_wildcards_init_for_packet(). Since that's the same as what
we want to do here, we can just use that instead of metaflow.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2015-07-16 09:05:33 -07:00
+								#include "bitmap.h"
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								#include "cmap.h"
-												dpif-netdev: Execute conntrack action.

This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will not
discard anymore flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>

											
										
										
											2015-11-15 22:07:25 -08:00
+								#include "conntrack.h"
-												dpif-netdev: Fix double inclusion of cmap.h

Also, all headers sorted lexicographically.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-02-25 10:51:01 +03:00
+								#include "coverage.h"
-												dpif-netdev: Implement conntrack dump functions.

New functions are implemented in the conntrack module to support this.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>

											
										
										
											2015-11-15 22:07:25 -08:00
+								#include "ct-dpif.h"
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include "csum.h"
-												dpif_packet: Rename to dp_packet

dp_packet is short and better name for datapath packet
structure.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2015-02-25 12:01:53 -08:00
+								#include "dp-packet.h"
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								#include "dpif.h"
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include "dpif-provider.h"
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								#include "dummy.h"
-												lib/classifier: Lockless lookups.

Now that all the relevant classifier structures use RCU and internal
mutual exclusion for modifications, we can remove the fat-rwlock and
thus make the classifier lookups lockless.

As the readers are operating concurrently with the writers, a
concurrent reader may or may not see a new rule being added by a
writer, depending on how the concurrent events overlap with each
other.  Overall, this is no different from the former locked behavior,
but there the visibility of the new rule only depended on the timing
of the locking functions.

A new rule is first added to the segment indices, so the readers may
find the rule in the indices before the rule is visible in the
subtables 'rules' map.  This may result in us losing the opportunity
to quit lookups earlier, resulting in sub-optimal wildcarding.  This
will be fixed by forthcoming revalidation always scheduled after flow
table changes.

Similar behavior may happen due to us removing the overlapping rule
(if any) from the indices only after the corresponding new rule has
been added.

The subtable's max priority is updated only after a rule is inserted
to the maps, so the concurrent readers may not see the rule, as the
updated priority ordered subtable list will only be visible after the
subtable's max priority is updated.

Similarly, the classifier's partitions are updated by the caller after
the rule is inserted to the maps, so the readers may keep skipping the
subtable until they see the updated partitions.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
											
										
										
											2014-07-11 02:29:08 -07:00
+								#include "fat-rwlock.h"
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include "flow.h"
-												dpif-netdev: Reload each thread only once in do_add_port.

While adding of pmd interface with multiple queues several queues
may be assigned to one thread and this thread will be reloaded
one time for each added queue.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:29 +03:00
+								#include "hmapx.h"
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								#include "latch.h"
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include "netdev.h"
-												netdev: New function netdev_get_dpif_port().

In future patches, a netdev's datapath port name may not
necessarily be the same as its device name. This patch prepares for
this by making the distinction in the netdev and dpif layers.

Signed-off-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-12-16 17:08:50 -08:00
+								#include "netdev-vport.h"
-												datapath: Replace "struct odp_action" by Netlink attributes.

In the medium term, we plan to migrate the datapath to use Netlink as its
communication channel.  In the short term, we need to be able to have
actions with 64-bit arguments but "struct odp_action" only has room for
48 bits.  So this patch shifts to variable-length arguments using Netlink
attributes, which starts in on the Netlink transition and makes 64-bit
arguments possible at the same time.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-10 10:40:58 -08:00
+								#include "netlink.h"
-												odp-execute: New module for executing datapath actions.

This moves generic action execution code out of lib/dpif-netedev.c
and into a new file, lib/odp-execute.c.

This is in preparation for using odp_execute_actions()
in lib/odp-util.c to handle recirculation/

Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-05-29 15:06:38 +09:00
+								#include "odp-execute.h"
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include "odp-util.h"
-												Move lib/ofp-print.h to include/openvswitch directory

Signed-off-by: Ben Warren <ben@skyportsystems.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-04-14 15:20:21 -07:00
+								#include "openvswitch/dynamic-string.h"
 								#include "openvswitch/list.h"
 								#include "openvswitch/match.h"
 								#include "openvswitch/ofp-print.h"
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								#include "openvswitch/ofp-util.h"
-												Move lib/ofpbuf.h to include/openvswitch directory

Signed-off-by: Ben Warren <ben@skyportsystems.com>
Acked-by: Ryan Moats <rmoats@us.ibm.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-03-25 14:10:24 -07:00
+								#include "openvswitch/ofpbuf.h"
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								#include "openvswitch/shash.h"
-												Move lib/ofp-print.h to include/openvswitch directory

Signed-off-by: Ben Warren <ben@skyportsystems.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-04-14 15:20:21 -07:00
+								#include "openvswitch/vlog.h"
-												dpif-netdev: Create multiple tx/rx queues when adding dpdk interface.

Before this commit, ovs creates one tx and one rx queue for
each dpdk interface and uses only one poll thread for handling
I/O of all dpdk interfaces.  An upcoming patch will allow multiple
poll threads be created.  As a preparation, this commit changes
the dpif-netdev to create multiple tx/rx queues when the dpdk
interface is added.

Specifically, the number of rx queues will still be one per-dpdk
interface for this commit.  But upcoming work will allow user
create multiple rx queues.  The number of tx queues will be the
number of cpu cores on the machine.  Although not all the tx queues
will be used, each poll thread will have its own queue for
transmission on the dpdk interface.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-06-17 10:52:20 -07:00
+								#include "ovs-numa.h"
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								#include "ovs-rcu.h"
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include "packets.h"
 								#include "poll-loop.h"
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								#include "pvector.h"
-												dpif-netdev: Implement OVS_ACTION_ATTR_SAMPLE action.

OVS_ACTION_ATTR_SAMPLE has never been implemented in dpif-netdev.  This
commit implements it and adds a cast to enum ovs_action_type in the switch
statement that checks the action type, so that GCC complains if we forget
to add a case for a new action type.

I had to assign the return value of nl_attr_type() to a temporary variable,
because "switch ((enum ovs_action_type) nl_attr_type(a))" provoked a GCC
warning that I've never seen before:

../lib/dpif-netdev.c:1260: warning: cast from function call of type 'int'
     to non-matching type 'enum ovs_action_type'

											
										
										
											2011-10-11 11:07:14 -07:00
+								#include "random.h"
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								#include "seq.h"
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								#include "smap.h"
-												dummy: Make --enable-dummy=override replace all dpifs, netdevs by dummies.

Plain "--enable-dummy" just creates new dummy dpif and netdev classes.
This commit makes "--enable-dummy=override" go a step farther and actually
delete and replace all the existing dpif and netdev classes by copies of
the dummy class.

This is useful for testing in an environment where changing the classes in
Bridge or Interface records is challenging.

Requested-by: Andrew Lambeth <wal@nicira.com>
Tested-by: Andrew Lambeth <wal@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-19 10:24:46 -08:00
+								#include "sset.h"
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include "timeval.h"
-												tnl-arp-cache: Rename module and functions to tnl-neigh-cache.

Since we don't distinguish between IPv4 and IPv6 lookups, consolidate ARP
and ND cache into neighbor cache. Other references to ARP related to the
ARP cache but that are not really about ARP have been renamed as well.
tnl_arp_lookup is kept for lookups using IPv4 instead of IPv4-mapped
addresses, but that is going to be removed in a later patch.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2015-11-30 16:24:49 -02:00
+								#include "tnl-neigh-cache.h"
-												tnl-ports: Add destination IP and MAC address to the match.

Currently tnl-port table wildcard destination ip and mac addresses
for given tunnel packet.  That could result accepting tunnel
packets destined for other hosts.  Following patch adds
support for matching for ip and mac address.
IP address upates to tnl-port table are piggybacked on
ovs-router updates.

Reported-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-09-03 00:42:34 -07:00
+								#include "tnl-ports.h"
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								#include "unixctl.h"
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								#include "util.h"
-												dpif-netdev: Fix double inclusion of cmap.h

Also, all headers sorted lexicographically.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-02-25 10:51:01 +03:00
-												vlog: Make client supply semicolon for VLOG_DEFINE_THIS_MODULE.

It's kind of odd for VLOG_DEFINE_THIS_MODULE to supply its own semicolon,
so this commit switches to the more common form.

											
										
										
											2010-10-19 14:47:01 -07:00
+								VLOG_DEFINE_THIS_MODULE(dpif_netdev);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
+								#define FLOW_DUMP_MAX_BATCH 50
-												ofproto/bond: Implement bond megaflow using recirculation

Infrastructure to enable megaflow support for bond ports using
recirculation. This patch adds the following features:
* Generate RECIRC action when bond can benefit from recirculation.
* Populate post recirculation rules in a hidden table. Currently table 254.
* Uses post recirculation rules for bond rebalancing
* A recirculation implementation in dpif-netdev.

The goal of this patch is to be able to megaflow bond outputs and
thus greatly improve performance. However, this patch does not
actually improve the megaflow generation. It is left for a later commit.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-05 15:27:31 -08:00
+								/* Use per thread recirc_depth to prevent recirculation loop. */
 								#define MAX_RECIRC_DEPTH 5
 								DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								/* Configuration parameters. */
 								enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								/* Protects against changes to 'dp_netdevs'. */
 								static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
 								/* Contains all 'struct dp_netdev's. */
 								static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
 								    = SHASH_INITIALIZER(&dp_netdevs);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
-												dpif-netdev: Execute conntrack action.

This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will not
discard anymore flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>

											
										
										
											2015-11-15 22:07:25 -08:00
+								#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
 								                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED)
 								#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
-												odp-util: Share fields between odp and dpif_backer.

Datapath support for some flow key fields is used inside ofproto-dpif as
well as odp-util. Share these fields using the same structure.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-30 16:43:03 -07:00
+								static struct odp_support dp_netdev_support = {
 								    .max_mpls_depth = SIZE_MAX,
 								    .recirc = true,
-												dpif-netdev: Execute conntrack action.

This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will not
discard anymore flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>

											
										
										
											2015-11-15 22:07:25 -08:00
+								    .ct_state = true,
 								    .ct_zone = true,
 								    .ct_mark = true,
 								    .ct_label = true,
-												odp-util: Share fields between odp and dpif_backer.

Datapath support for some flow key fields is used inside ofproto-dpif as
well as odp-util. Share these fields using the same structure.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-30 16:43:03 -07:00
+								};
-												dpif-netdev: Introduce netdev_flow_key_* functions

netdev_flow_key is a miniflow with the following constraints:

1) It is used only inside dpif-netdev.c.
2) It always has inline values.
3) It contains only miniflows created by miniflow_extract().

Therefore, by using these new functions instead of the miniflow_*
ones, we get the following (performance related) benefits:

- Because of (1) the functions can be inlined.
- Because of (2) and (3) the netdev_flow_key can be treated as POD.
  Specifically, because of (3), we can do comparisons with memcmp,
  since if the map is different the miniflow must be different.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
											
										
										
											2014-09-06 08:10:42 +00:00
+								/* Stores a miniflow with inline values */
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
 								struct netdev_flow_key {
-												lib/dpif-netdev: Fix EMC lookup.

Patch 0de8783a9 (lib/dpif-netdev: Integrate megaflow classifier.)
broke exact match cache lookup, but it went undetected since there are
no separate stats for EMC.

This patch fixes the problem by changing the struct netdev_flow_key
'len' member to cover only the 'mf' member, not the whole
netdev_flow_key, and ignoring the 'len' field in
netdev_flow_key_equal.  Comparison is still accurate, as the miniflow
'map' field encodes the length in the number of 1-bits, and the map is
included in the comparison.

Reported-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>

											
										
										
											2014-10-17 17:03:13 -07:00
+								    uint32_t hash;       /* Hash function differs for different users. */
 								    uint32_t len;        /* Length of the following miniflow (incl. map). */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    struct miniflow mf;
-												flow: Always inline miniflows.

Now that performance critical code already inlines miniflows and
minimasks, we can simplify struct miniflow by always dynamically
allocating miniflows and minimasks to the correct size.  This changes
the struct minimatch to always contain pointers to its miniflow and
minimask.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-15 13:17:01 -07:00
+								    uint64_t buf[FLOW_MAX_PACKET_U64S];
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								};
 								/* Exact match cache for frequently used flows
 								 *
 								 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
 								 * search its entries for a miniflow that matches exactly the miniflow of the
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								 *
 								 * A cache entry holds a reference to its 'dp_netdev_flow'.
 								 *
 								 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
 								 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
 								 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
 								 * value is the index of a cache entry where the miniflow could be.
 								 *
 								 *
 								 * Thread-safety
 								 * =============
 								 *
 								 * Each pmd_thread has its own private exact match cache.
 								 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
 								 */
-												dpif-netdev: Increase the number of EMC entries

Prior to this commit, the number of possible entries in the Exact
Match Cache stood at 1024 per thread exacting to 0.18Mb. A typical
server system will have 2.5Mb cache per core meaning a larger EMC will
comfortably fit in. This patch increases the number of entries to 8192
per thread (1.4Mb) which in turn yields improved throughput when
processing multiple flows of traffic.

Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-13 14:54:56 +01:00
+								#define EM_FLOW_HASH_SHIFT 13
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
 								#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
 								#define EM_FLOW_HASH_SEGS 2
 								struct emc_entry {
 								    struct dp_netdev_flow *flow;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    struct netdev_flow_key key;   /* key.hash used for emc hash value. */
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								};
 								struct emc_cache {
 								    struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
-												dpif-netdev: Garbage collect the exact match cache periodically.

On current master, the exact match cache entry can keep reference to
'struct dp_netdev_flow' even after the flow is removed from the flow
table.  This means the free of allocated memory of the flow is delayed
until the exact match cache entry is cleared or replaced.

If the allocated memory is ahead of chunks of freed memory on heap,
the delay will prevent the reclaim of those freed chunks, causing
falsely high memory utilization.

To fix the issue, this commit makes the owning thread conduct periodic
garbage collection on the exact match cache and clear dead entries.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

---
PATCH -> V2:
- Adopt Jarno's suggestion and conduct slow sweep to avoid introducing
  jitter.

											
										
										
											2014-11-14 15:54:56 -08:00
+								    int sweep_idx;                /* For emc_cache_slow_sweep(). */
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								};
 								/* Iterate in the exact match cache through every entry that might contain a
 								 * miniflow with hash 'HASH'. */
 								#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH)                 \
 								    for (uint32_t i__ = 0, srch_hash__ = (HASH);                             \
 								         (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
 								         i__ < EM_FLOW_HASH_SEGS;                                            \
 								         i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
 								/* Simple non-wildcarding single-priority classifier. */
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								/* Time in ms between successive optimizations of the dpcls subtable vector */
 								#define DPCLS_OPTIMIZATION_INTERVAL 1000
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								struct dpcls {
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
 								    odp_port_t in_port;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    struct cmap subtables_map;
-												Revert "pvector: Expose non-concurrent priority vector."

This reverts commit 8bdfe1313894047d44349fa4cf4402970865950f.

I failed to see that lib/dpif-netdev.c actually needs the concurrency
provided by pvector prior to this change.  More specifically, when a
subtable is removed, concurrent lookups may skip over another subtable
swapped in to the place of the removed subtable in the vector.

Since this was the only use of the non-concurrent pvector, it is
cleaner to revert the whole patch.

Reported-by: Jan Scheurich <jan.scheurich@ericsson.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
											
										
										
											2016-08-10 14:58:51 -07:00
+								    struct pvector subtables;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								};
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								/* A rule to be inserted to the classifier. */
 								struct dpcls_rule {
 								    struct cmap_node cmap_node;   /* Within struct dpcls_subtable 'rules'. */
 								    struct netdev_flow_key *mask; /* Subtable's mask. */
 								    struct netdev_flow_key flow;  /* Matching key. */
 								    /* 'flow' must be the last field, additional space is allocated here. */
 								};
 								static void dpcls_init(struct dpcls *);
 								static void dpcls_destroy(struct dpcls *);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								static void dpcls_sort_subtable_vector(struct dpcls *);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
 								                         const struct netdev_flow_key *mask);
 								static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								static bool dpcls_lookup(struct dpcls *cls,
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								                         const struct netdev_flow_key keys[],
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								                         struct dpcls_rule **rules, size_t cnt,
 								                         int *num_lookups_p);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								/* Datapath based on the network device interface from netdev.h.
 								 *
 								 *
 								 * Thread-safety
 								 * =============
 								 *
 								 * Some members, marked 'const', are immutable.  Accessing other members
 								 * requires synchronization, as noted in more detail below.
 								 *
 								 * Acquisition order is, from outermost to innermost:
 								 *
 								 *    dp_netdev_mutex (global)
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								 *    port_mutex
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								 *    non_pmd_mutex
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								 */
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								struct dp_netdev {
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    const struct dpif_class *const class;
 								    const char *const name;
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								    struct dpif *dpif;
-												dpif-netdev: Take advantage of ovs_refcount for dp_netdev.

By making "destroyed" own a reference, we can treat dp_netdev's ref_cnt
like any other in Open vSwitch.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 19:41:10 -08:00
+								    struct ovs_refcount ref_cnt;
 								    atomic_flag destroyed;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    /* Ports.
 								     *
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								     * Any lookup into 'ports' or any access to the dp_netdev_ports found
 								     * through 'ports' requires taking 'port_mutex'. */
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    struct ovs_mutex port_mutex;
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    struct hmap ports;
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    struct seq *port_seq;       /* Incremented whenever a port changes. */
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								    /* Protects access to ofproto-dpif-upcall interface during revalidator
 								     * thread synchronization. */
 								    struct fat_rwlock upcall_rwlock;
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
 								    void *upcall_aux;
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
-												dpif-netdev: Purge all ukeys when reconfigure pmd.

When dpdk configuration changes, all pmd threads are recreated
and rx queues of each port are reloaded.  After this process,
rx queue could be mapped to a different pmd thread other than
the one before reconfiguration.  However, this is totally
transparent to ofproto layer modules.  So, if the ofproto-dpif-upcall
module still holds ukeys generated before pmd thread recreation,
this old ukey will collide with the ukey for the new upcalls
from same traffic flow, causing flow installation failure.

To fix the bug, this commit adds a new call-back function
in dpif layer for notifying upper layer the purging of datapath
(e.g. pmd thread deletion in dpif-netdev).  So, the
ofproto-dpif-upcall module can react properly with deleting
the ukeys and with collecting flows' last stats.

Reported-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Alex Wang <ee07b291@gmail.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joestringer@nicira.com>

											
										
										
											2015-08-25 16:36:46 -07:00
+								    /* Callback function for notifying the purging of dp flows (during
 								     * reseting pmd deletion). */
 								    dp_purge_callback *dp_purge_cb;
 								    void *dp_purge_aux;
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    /* Stores all 'struct dp_netdev_pmd_thread's. */
 								    struct cmap poll_threads;
 								    /* Protects the access of the 'struct dp_netdev_pmd_thread'
 								     * instance for non-pmd thread. */
 								    struct ovs_mutex non_pmd_mutex;
 								    /* Each pmd thread will store its pointer to
 								     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
 								    ovsthread_key_t per_pmd_key;
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
-												dpif-netdev: Add reconfiguration request to dp_netdev.

Next patches will add new conditions when reconfiguration will be
required. It'll be simpler to have common way to request reconfiguration.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:43 +03:00
+								    struct seq *reconfigure_seq;
 								    uint64_t last_reconfigure_seq;
-												dpif-netdev: Allow different numbers of rx queues for different ports.

Currently, all of the PMD netdevs can only have the same number of
rx queues, which is specified in other_config:n-dpdk-rxqs.

Fix that by introducing of new option for PMD interfaces: 'n_rxq', which
specifies the maximum number of rx queues to be created for this
interface.

Example:
	ovs-vsctl set Interface dpdk0 options:n_rxq=8

Old 'other_config:n-dpdk-rxqs' deleted.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ben Pfaff <blp@ovn.org>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-21 17:15:18 +03:00
+								    /* Cpu mask for pin of pmd threads. */
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								    char *pmd_cmask;
-												dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-23 15:33:43 -08:00
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								    uint64_t last_tnl_conf_seq;
-												dpif-netdev: Execute conntrack action.

This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will not
discard anymore flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>

											
										
										
											2015-11-15 22:07:25 -08:00
 								    struct conntrack conntrack;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								};
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								                                                    odp_port_t)
 								    OVS_REQUIRES(dp->port_mutex);
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
-												ovs-thread: Replace ovsthread_counter by more general ovsthread_stats.

This allows clients to do more than just increment a counter.  The
following commit will make the first use of that feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-19 07:47:12 -07:00
+								enum dp_stat_type {
-												dpif-netdev: Count exact match cache hits.

We used to count exact match cache hits and masked classifier hits
together. This commit splits the DP_STAT_HIT counter into two.
This change will be used by future commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:15 +01:00
+								    DP_STAT_EXACT_HIT,          /* Packets that had an exact match (emc). */
 								    DP_STAT_MASKED_HIT,         /* Packets that matched in the flow table. */
-												ovs-thread: Replace ovsthread_counter by more general ovsthread_stats.

This allows clients to do more than just increment a counter.  The
following commit will make the first use of that feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-19 07:47:12 -07:00
+								    DP_STAT_MISS,               /* Packets that did not match. */
 								    DP_STAT_LOST,               /* Packets not passed up to the client. */
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    DP_STAT_LOOKUP_HIT,         /* Number of subtable lookups for flow table
 								                                   hits */
-												ovs-thread: Replace ovsthread_counter by more general ovsthread_stats.

This allows clients to do more than just increment a counter.  The
following commit will make the first use of that feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-19 07:47:12 -07:00
+								    DP_N_STATS
 								};
-												dpif-netdev: Add simple per pmd-thread cycles counters.

The counters use x86 TSC if available (currently only with DPDK). They
will be exposed by subsequents commits

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:48 +01:00
+								enum pmd_cycles_counter_type {
 								    PMD_CYCLES_POLLING,         /* Cycles spent polling NICs. */
 								    PMD_CYCLES_PROCESSING,      /* Cycles spent processing packets */
 								    PMD_N_CYCLES
 								};
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								#define XPS_TIMEOUT_MS 500LL
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								/* Contained by struct dp_netdev_port's 'rxqs' member.  */
 								struct dp_netdev_rxq {
 								    struct netdev_rxq *rxq;
 								    unsigned core_id;           /* Сore to which this queue is pinned. */
 								};
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								/* A port in a netdev-based datapath. */
 								struct dp_netdev_port {
-												tunnels: Don't initialize unnecessary packet metadata.

The addition of Geneve options to packet metadata significantly
expanded its size. It was reported that this can decrease performance
for DPDK ports by up to 25% since we need to initialize the whole
structure on each packet receive.

It is not really necessary to zero out the entire structure because
miniflow_extract() only copies the tunnel metadata when particular
fields indicate that it is valid. Therefore, as long as we zero out
these fields when the metadata is initialized and ensure that the
rest of the structure is correctly set in the presence of a tunnel,
we can avoid touching the tunnel fields on packet reception.

Reported-by: Ciara Loftus <ciara.loftus@intel.com>
Tested-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-06-30 19:19:40 -07:00
+								    odp_port_t port_no;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    struct netdev *netdev;
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
-												netdev: Factor restoring flags into new "struct netdev_saved_flags".

This gets rid of the only per-instance data in "struct netdev", which
will make it possible to merge "struct netdev_dev" into "struct netdev" in
a later commit.

Ed Maste wrote the netdev-bsd changes in this commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Co-authored-by: Ed Maste <emaste@freebsd.org>
Signed-off-by: Ed Maste <emaste@freebsd.org>
Tested-by: Ed Maste <emaste@freebsd.org>

											
										
										
											2013-05-10 08:55:25 -07:00
+								    struct netdev_saved_flags *sf;
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    struct dp_netdev_rxq *rxqs;
-												dpif-netdev: Reorder elements in dp_netdev_port structure.

By reordering the data elements in dp_netdev_port structure, pad bytes
can be reduced and there by saving a cache line.

Before: structure size:136, holes:3, sum padbytes:15, cachelines:3
After: structure size:128, holes:2, sum padbytes:7, cachelines:2

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-14 15:37:11 +01:00
+								    unsigned n_rxq;             /* Number of elements in 'rxq' */
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    bool dynamic_txqs;          /* If true XPS will be used. */
 								    unsigned *txq_used;         /* Number of threads that uses each tx queue. */
 								    struct ovs_mutex txq_used_mutex;
-												dummy: Make --enable-dummy=override replace all dpifs, netdevs by dummies.

Plain "--enable-dummy" just creates new dummy dpif and netdev classes.
This commit makes "--enable-dummy=override" go a step farther and actually
delete and replace all the existing dpif and netdev classes by copies of
the dummy class.

This is useful for testing in an environment where changing the classes in
Bridge or Interface records is challenging.

Requested-by: Andrew Lambeth <wal@nicira.com>
Tested-by: Andrew Lambeth <wal@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-19 10:24:46 -08:00
+								    char *type;                 /* Port type as requested by user. */
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								};
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								/* Contained by struct dp_netdev_flow's 'stats' member.  */
 								struct dp_netdev_flow_stats {
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								    atomic_llong used;             /* Last used time, in monotonic msecs. */
 								    atomic_ullong packet_count;    /* Number of packets matched. */
 								    atomic_ullong byte_count;      /* Number of bytes matched. */
 								    atomic_uint16_t tcp_flags;     /* Bitwise-OR of seen tcp_flags values. */
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								};
 								/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								 *
 								 *
 								 * Thread-safety
 								 * =============
 								 *
 								 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								 * its pmd thread's classifier.  The text below calls this classifier 'cls'.
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								 *
 								 * Motivation
 								 * ----------
 								 *
 								 * The thread safety rules described here for "struct dp_netdev_flow" are
 								 * motivated by two goals:
 								 *
 								 *    - Prevent threads that read members of "struct dp_netdev_flow" from
 								 *      reading bad data due to changes by some thread concurrently modifying
 								 *      those members.
 								 *
 								 *    - Prevent two threads making changes to members of a given "struct
 								 *      dp_netdev_flow" from interfering with each other.
 								 *
 								 *
 								 * Rules
 								 * -----
 								 *
-												dpif-netdev: Reintroduce ref_cnt for dp_netdev_flow

struct dp_netdev_flow used to have a reference counter.
It has been replaced by RCU. Unfortunately RCU is not
enough if we plan to hold a reference to the dp_netdev_flow
for a long time. So this commit reintroduces reference
counting for struct dp_netdev_flow

Subsequent commits make use of it.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-11 17:25:50 -07:00
+								 * A flow 'flow' may be accessed without a risk of being freed during an RCU
 								 * grace period.  Code that needs to hold onto a flow for a while
 								 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								 *
 								 * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
-												dpif-netdev: Reintroduce ref_cnt for dp_netdev_flow

struct dp_netdev_flow used to have a reference counter.
It has been replaced by RCU. Unfortunately RCU is not
enough if we plan to hold a reference to the dp_netdev_flow
for a long time. So this commit reintroduces reference
counting for struct dp_netdev_flow

Subsequent commits make use of it.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-11 17:25:50 -07:00
+								 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 								 * from modification.
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								 *
 								 * Some members, marked 'const', are immutable.  Accessing other members
 								 * requires synchronization, as noted in more detail below.
 								 */
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								struct dp_netdev_flow {
-												dpif-netdev: Store batch pointer in dp_netdev_flow.

The userspace datapath

1. receives a batch of packets.
2. finds a 'netdev_flow' (megaflow) for each packet.
3. groups the packets in output batches based on the 'netdev_flow'.

Until now the grouping (2) was done using a simple algorithm with a
O(N^2) runtime, where N is the number of distinct megaflows of the packets
in the incoming batch.  This could quickly become a bottleneck, even with
a small number of megaflows.

With this commit the datapath simply stores in the 'netdev_flow' (the
megaflow) a pointer to the output batch, if one has been created for the
current input batch.  The pointer will be cleared when the output batch
is sent.

In a simple phy2phy test with 128 megaflows the throughput is more than
doubled.

The reason that stopped us from doing this change was that the
'netdev_flow' memory was shared between multiple threads: this is no
longer the case with the per-thread classifier.

Also, this commit reorders struct dp_netdev_flow to group toghether the
members used in the fastpath.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:50 -07:00
+								    const struct flow flow;      /* Unmasked flow that created this entry. */
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    /* Hash table index by unmasked flow. */
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
 								                                 /* 'flow_table'. */
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								    const ovs_u128 ufid;         /* Unique flow identifier. */
-												ovs-numa: Change 'core_id' to unsigned.

DPDK lcore_id is unsigned.  We need to support big values like
LCORE_ID_ANY (=UINT32_MAX).  Therefore I am changing the type everywhere
in OVS.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:19 +01:00
+								    const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								                                 /* flow. */
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Reintroduce ref_cnt for dp_netdev_flow

struct dp_netdev_flow used to have a reference counter.
It has been replaced by RCU. Unfortunately RCU is not
enough if we plan to hold a reference to the dp_netdev_flow
for a long time. So this commit reintroduces reference
counting for struct dp_netdev_flow

Subsequent commits make use of it.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-11 17:25:50 -07:00
+								    /* Number of references.
 								     * The classifier owns one reference.
 								     * Any thread trying to keep a rule from being freed should hold its own
 								     * reference. */
 								    struct ovs_refcount ref_cnt;
-												dpif-netdev: Store batch pointer in dp_netdev_flow.

The userspace datapath

1. receives a batch of packets.
2. finds a 'netdev_flow' (megaflow) for each packet.
3. groups the packets in output batches based on the 'netdev_flow'.

Until now the grouping (2) was done using a simple algorithm with a
O(N^2) runtime, where N is the number of distinct megaflows of the packets
in the incoming batch.  This could quickly become a bottleneck, even with
a small number of megaflows.

With this commit the datapath simply stores in the 'netdev_flow' (the
megaflow) a pointer to the output batch, if one has been created for the
current input batch.  The pointer will be cleared when the output batch
is sent.

In a simple phy2phy test with 128 megaflows the throughput is more than
doubled.

The reason that stopped us from doing this change was that the
'netdev_flow' memory was shared between multiple threads: this is no
longer the case with the per-thread classifier.

Also, this commit reorders struct dp_netdev_flow to group toghether the
members used in the fastpath.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:50 -07:00
+								    bool dead;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    /* Statistics. */
 								    struct dp_netdev_flow_stats stats;
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
-												dpif-netdev: Remove unused members

Simplify code and update comments after commit 61e7deb1.
("dpif-netdev: Use RCU to protect data.")

Acked-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>

											
										
										
											2014-04-15 14:59:30 +09:00
+								    /* Actions. */
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								    OVSRCU_TYPE(struct dp_netdev_actions *) actions;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												dpif-netdev: Store batch pointer in dp_netdev_flow.

The userspace datapath

1. receives a batch of packets.
2. finds a 'netdev_flow' (megaflow) for each packet.
3. groups the packets in output batches based on the 'netdev_flow'.

Until now the grouping (2) was done using a simple algorithm with a
O(N^2) runtime, where N is the number of distinct megaflows of the packets
in the incoming batch.  This could quickly become a bottleneck, even with
a small number of megaflows.

With this commit the datapath simply stores in the 'netdev_flow' (the
megaflow) a pointer to the output batch, if one has been created for the
current input batch.  The pointer will be cleared when the output batch
is sent.

In a simple phy2phy test with 128 megaflows the throughput is more than
doubled.

The reason that stopped us from doing this change was that the
'netdev_flow' memory was shared between multiple threads: this is no
longer the case with the per-thread classifier.

Also, this commit reorders struct dp_netdev_flow to group toghether the
members used in the fastpath.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:50 -07:00
+								    /* While processing a group of input packets, the datapath uses the next
 								     * member to store a pointer to the output batch for the flow.  It is
 								     * reset after the batch has been sent out (See dp_netdev_queue_batches(),
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								     * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
 								    struct packet_batch_per_flow *batch;
-												dpif-netdev: Store batch pointer in dp_netdev_flow.

The userspace datapath

1. receives a batch of packets.
2. finds a 'netdev_flow' (megaflow) for each packet.
3. groups the packets in output batches based on the 'netdev_flow'.

Until now the grouping (2) was done using a simple algorithm with a
O(N^2) runtime, where N is the number of distinct megaflows of the packets
in the incoming batch.  This could quickly become a bottleneck, even with
a small number of megaflows.

With this commit the datapath simply stores in the 'netdev_flow' (the
megaflow) a pointer to the output batch, if one has been created for the
current input batch.  The pointer will be cleared when the output batch
is sent.

In a simple phy2phy test with 128 megaflows the throughput is more than
doubled.

The reason that stopped us from doing this change was that the
'netdev_flow' memory was shared between multiple threads: this is no
longer the case with the per-thread classifier.

Also, this commit reorders struct dp_netdev_flow to group toghether the
members used in the fastpath.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:50 -07:00
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    /* Packet classification. */
 								    struct dpcls_rule cr;        /* In owning dp_netdev's 'cls'. */
 								    /* 'cr' must be the last member. */
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								};
-												dpif-netdev: Reintroduce ref_cnt for dp_netdev_flow

struct dp_netdev_flow used to have a reference counter.
It has been replaced by RCU. Unfortunately RCU is not
enough if we plan to hold a reference to the dp_netdev_flow
for a long time. So this commit reintroduces reference
counting for struct dp_netdev_flow

Subsequent commits make use of it.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-11 17:25:50 -07:00
+								static void dp_netdev_flow_unref(struct dp_netdev_flow *);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
 								                                         struct flow *);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								/* A set of datapath actions within a "struct dp_netdev_flow".
 								 *
 								 *
 								 * Thread-safety
 								 * =============
 								 *
-												dpif-netdev: Remove unused members

Simplify code and update comments after commit 61e7deb1.
("dpif-netdev: Use RCU to protect data.")

Acked-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>

											
										
										
											2014-04-15 14:59:30 +09:00
+								 * A struct dp_netdev_actions 'actions' is protected with RCU. */
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								struct dp_netdev_actions {
 								    /* These members are immutable: they do not change during the struct's
 								     * lifetime.  */
 								    unsigned int size;          /* Size of 'actions', in bytes. */
-												dpif-netdev: Store actions data and size contiguously.

As stated by the comment above the structure, the 'action' pointer does not
change during the 'dp_netdev_actions' lifetime: we might as well embed
the pointed memory into the structure.

The commit also updates the description of dp_netdev_actions_create().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:46 +01:00
+								    struct nlattr actions[];    /* Sequence of OVS_ACTION_ATTR_* attributes. */
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								};
 								struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
 								                                                   size_t);
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								struct dp_netdev_actions *dp_netdev_flow_get_actions(
 								    const struct dp_netdev_flow *);
 								static void dp_netdev_actions_free(struct dp_netdev_actions *);
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								/* Contained by struct dp_netdev_pmd_thread's 'stats' member.  */
 								struct dp_netdev_pmd_stats {
 								    /* Indexed by DP_STAT_*. */
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								    atomic_ullong n[DP_N_STATS];
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								};
-												dpif-netdev: Add simple per pmd-thread cycles counters.

The counters use x86 TSC if available (currently only with DPDK). They
will be exposed by subsequents commits

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:48 +01:00
+								/* Contained by struct dp_netdev_pmd_thread's 'cycle' member.  */
 								struct dp_netdev_pmd_cycles {
 								    /* Indexed by PMD_CYCLES_*. */
 								    atomic_ullong n[PMD_N_CYCLES];
 								};
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
 								struct rxq_poll {
 								    struct dp_netdev_port *port;
 								    struct netdev_rxq *rx;
 								    struct ovs_list node;
 								};
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
 								 * 'tnl_port_cache' or 'tx_ports'. */
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								struct tx_port {
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    struct dp_netdev_port *port;
 								    int qid;
 								    long long last_used;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    struct hmap_node node;
 								};
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								/* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
 								 * the performance overhead of interrupt processing.  Therefore netdev can
 								 * not implement rx-wait for these devices.  dpif-netdev needs to poll
 								 * these device to check for recv buffer.  pmd-thread does polling for
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								 * devices assigned to itself.
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								 *
 								 * DPDK used PMD for accessing NIC.
 								 *
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
 								 * I/O of all non-pmd threads.  There will be no actual thread created
 								 * for the instance.
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								 *
 								 * Each struct has its own flow table and classifier.  Packets received
 								 * from managed ports are looked up in the corresponding pmd thread's
 								 * flow table, and are executed with the found actions.
 								 * */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								struct dp_netdev_pmd_thread {
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								    struct dp_netdev *dp;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    struct ovs_refcount ref_cnt;    /* Every reference must be refcount'ed. */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    struct cmap_node node;          /* In 'dp->poll_threads'. */
-												dpif-netdev: Allow direct destroy of 'struct dp_netdev_port'.

Before this commit, when 'struct dp_netdev_port' is deleted from
'dpif-netdev' datapath, if there is pmd thread, the pmd thread
will release the last reference to the port and ovs-rcu postpone
the destroy.  However, the delayed close of object like 'struct
netdev' could cause failure in immediate re-add or reconfigure of
the same device.

To fix the above issue, this commit uses condition variable and
makes the main thread wait for pmd thread to release the reference
when deleting port.  Then, the main thread can directly destroy the
port.

Reported-by: Cian Ferriter <cian.ferriter@intel.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-11-07 13:27:05 -08:00
 								    pthread_cond_t cond;            /* For synchronizing pmd thread reload. */
 								    struct ovs_mutex cond_mutex;    /* Mutex for condition variable. */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    /* Per thread exact-match cache.  Note, the instance for cpu core
 								     * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								     * need to be protected by 'non_pmd_mutex'.  Every other instance
 								     * will only be accessed by its own pmd thread. */
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    struct emc_cache flow_cache;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    /* Flow-Table and classifiers
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								     *
 								     * Writers of 'flow_table' must take the 'flow_mutex'.  Corresponding
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								     * changes to 'classifiers' must be made while still holding the
 								     * 'flow_mutex'.
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								     */
 								    struct ovs_mutex flow_mutex;
 								    struct cmap flow_table OVS_GUARDED; /* Flow table. */
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    /* One classifier per in_port polled by the pmd */
 								    struct cmap classifiers;
 								    /* Periodically sort subtable vectors according to hit frequencies */
 								    long long int next_optimization;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    /* Statistics. */
 								    struct dp_netdev_pmd_stats stats;
-												dpif-netdev: Add simple per pmd-thread cycles counters.

The counters use x86 TSC if available (currently only with DPDK). They
will be exposed by subsequents commits

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:48 +01:00
+								    /* Cycles counters */
 								    struct dp_netdev_pmd_cycles cycles;
 								    /* Used to count cicles. See 'cycles_counter_end()' */
 								    unsigned long long last_cycles;
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    struct latch exit_latch;        /* For terminating the pmd thread. */
 								    atomic_uint change_seq;         /* For reloading pmd ports. */
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								    pthread_t thread;
-												ovs-numa: Change 'core_id' to unsigned.

DPDK lcore_id is unsigned.  We need to support big values like
LCORE_ID_ANY (=UINT32_MAX).  Therefore I am changing the type everywhere
in OVS.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:19 +01:00
+								    unsigned core_id;               /* CPU core id of this pmd thread. */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    int numa_id;                    /* numa node id of this pmd thread. */
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    bool isolated;
-												netdev-dpdk: Obtain number of queues for vhost ports from attached virtio.

Currently, there are few inconsistencies in ways to configure number of
queues for netdev device:

	* dpif-netdev can't know about exact number of queues
	  allocated inside netdev.
	  This leads to constant mapping of queue-ids to 'real' ones.

	* We are able to configure 'n_rxq' for vhost-user devices, but
	  there is only one sane number of rx queues which must be used
	  and configured manually (number of queues that allocated
	  in QEMU).

This patch disables configuration of 'n_rxq' for DPDK vHost devices.
Configuration of rx and tx queues now automatically applied from
connected virtio device. Standard reconfiguration mechanism was used to
apply this changes.

Also, now 'n_txq' and 'n_rxq' are always the real numbers of queues
in the device.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-08 16:52:37 +03:00
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    /* Queue id used by this pmd thread to send packets on all netdevs if
 								     * XPS disabled for this netdev. All static_tx_qid's are unique and less
 								     * than 'ovs_numa_get_n_cores() + 1'. */
 								    atomic_int static_tx_qid;
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								    /* List of rx queues to poll. */
 								    struct ovs_list poll_list OVS_GUARDED;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    /* Number of elements in 'poll_list' */
 								    int poll_cnt;
 								    /* Map of 'tx_port's used for transmission.  Written by the main thread,
 								     * read by the pmd thread. */
 								    struct hmap tx_ports OVS_GUARDED;
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								    /* These are thread-local copies of 'tx_ports'.  One contains only tunnel
 								     * ports (that support push_tunnel/pop_tunnel), the other contains ports
 								     * with at least one txq (that support send).  A port can be in both.
 								     *
 								     * There are two separate maps to make sure that we don't try to execute
 								     * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
 								     *
 								     * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
 								     * threads, and thusly need to be protected by 'non_pmd_mutex'.  Every
 								     * other instance will only be accessed by its own pmd thread. */
 								    struct hmap tnl_port_cache;
 								    struct hmap send_port_cache;
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
+								    /* Only a pmd thread can write on its own 'cycles' and 'stats'.
 								     * The main thread keeps 'stats_zero' and 'cycles_zero' as base
 								     * values and subtracts them from 'stats' and 'cycles' before
 								     * reporting to the user */
 								    unsigned long long stats_zero[DP_N_STATS];
 								    uint64_t cycles_zero[PMD_N_CYCLES];
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								};
-												lib/dpif-netdev: Clean-up pmd thread signaling.

It could be possible that the thread misses a signal when it reads the
change_seq again after reload.  Also, the counter has no dependent
data, so the memory model for the atomic read can be relaxed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
											
										
										
											2014-08-15 15:09:38 -07:00
+								#define PMD_INITIAL_SEQ 1
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								/* Interface to netdev-based datapath. */
 								struct dpif_netdev {
 								    struct dpif dpif;
 								    struct dp_netdev *dp;
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    uint64_t last_port_seq;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								};
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								                              struct dp_netdev_port **portp)
 								    OVS_REQUIRES(dp->port_mutex);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								static int get_port_by_name(struct dp_netdev *dp, const char *devname,
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								                            struct dp_netdev_port **portp)
 								    OVS_REQUIRES(dp->port_mutex);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								static void dp_netdev_free(struct dp_netdev *)
 								    OVS_REQUIRES(dp_netdev_mutex);
 								static int do_add_port(struct dp_netdev *dp, const char *devname,
 								                       const char *type, odp_port_t port_no)
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    OVS_REQUIRES(dp->port_mutex);
-												bridge: Add test that ports that disappear get added back to the datapath.

The test added in this commit would have caught the bug fixed by commit
96be8de595150 (bridge: When ports disappear from a datapath, add them
back.).  With that commit reverted, the new test fails.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Gurucharan Shetty <gshetty@nicira.com>

											
										
										
											2014-05-22 09:36:00 -07:00
+								static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    OVS_REQUIRES(dp->port_mutex);
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								static int dpif_netdev_open(const struct dpif_class *, const char *name,
 								                            bool create, struct dpif **);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                                      struct dp_packet_batch *,
-												conntrack: Add 'dl_type' parameter to conntrack_execute().

Now that dpif_execute has a 'flow' member, it's pretty easy to access a
the flow (or the matching megaflow) in dp_execute_cb().

This means that's not necessary anymore for the connection tracker to
reextract 'dl_type' from the packet, it can be passed as a parameter.

This change means that we have to complicate sightly test-conntrack to
group the packets by dl_type before passing them to the connection
tracker.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-25 18:10:09 -07:00
+								                                      bool may_steal, const struct flow *flow,
-												datapath: Refactor actions in terms of match fields.

Almost all current actions can be expressed in the form of
push/pop/set <field>, where field is one of the match fields. We can
create three base actions and take a field. This has both a nice
symmetry and avoids inconsistencies where we can match on the vlan
TPID but not set it.
Following patch converts all actions to this new format.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

Bug #7115

											
										
										
											2011-10-21 14:38:54 -07:00
+								                                      const struct nlattr *actions,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                                      size_t actions_len,
 								                                      long long now);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static void dp_netdev_input(struct dp_netdev_pmd_thread *,
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                            struct dp_packet_batch *, odp_port_t port_no);
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                                  struct dp_packet_batch *);
-												netdev-dpif: Add metadata to dpif-packet.

Today dpif-netdev has single metadat for given batch, since one
batch belongs to one port, but soon packets fro single tunnel ports
can belong to different ports, so we need to have per packet metadata.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-10-03 20:23:58 -07:00
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								static void dp_netdev_disable_upcall(struct dp_netdev *);
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
-												dpif-netdev: Remove unused 'index' in dp_netdev_pmd_thread.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 12:54:10 -07:00
+								                                    struct dp_netdev *dp, unsigned core_id,
 								                                    int numa_id);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
 								    OVS_REQUIRES(dp->port_mutex);
-												dpif-netdev: Add function to get pmd using core id.

This commit adds the function dp_netdev_get_pmd() which allows
users to get 'struct dp_netdev_pmd_thread' based on the core id.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-13 14:47:09 -07:00
+								static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
-												ovs-numa: Change 'core_id' to unsigned.

DPDK lcore_id is unsigned.  We need to support big values like
LCORE_ID_ANY (=UINT32_MAX).  Therefore I am changing the type everywhere
in OVS.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:19 +01:00
+								                                                      unsigned core_id);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								static struct dp_netdev_pmd_thread *
 								dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
 								static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
 								    OVS_REQUIRES(dp->port_mutex);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
 								                                             struct dp_netdev_port *port);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								static void dp_netdev_add_port_to_pmds(struct dp_netdev *dp,
 								                                       struct dp_netdev_port *port);
 								static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
 								                                         struct dp_netdev_port *port);
 								static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
 								                                     struct dp_netdev_port *port,
 								                                     struct netdev_rxq *rx);
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								static struct dp_netdev_pmd_thread *
 								dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
 								    OVS_REQUIRES(dp->port_mutex);
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								static void reconfigure_pmd_threads(struct dp_netdev *dp)
 								    OVS_REQUIRES(dp->port_mutex);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
 								static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
 								static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
 								    OVS_REQUIRES(pmd->port_mutex);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								static inline void
 								dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								static void
 								dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
 								                               long long now, bool purge);
 								static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
 								                                      struct tx_port *tx, long long now);
-												dpif-netdev: Garbage collect the exact match cache periodically.

On current master, the exact match cache entry can keep reference to
'struct dp_netdev_flow' even after the flow is removed from the flow
table.  This means the free of allocated memory of the flow is delayed
until the exact match cache entry is cleared or replaced.

If the allocated memory is ahead of chunks of freed memory on heap,
the delay will prevent the reclaim of those freed chunks, causing
falsely high memory utilization.

To fix the issue, this commit makes the owning thread conduct periodic
garbage collection on the exact match cache and clear dead entries.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

---
PATCH -> V2:
- Adopt Jarno's suggestion and conduct slow sweep to avoid introducing
  jitter.

											
										
										
											2014-11-14 15:54:56 -08:00
+								static inline bool emc_entry_alive(struct emc_entry *ce);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								static void emc_clear_entry(struct emc_entry *ce);
 								static void
 								emc_cache_init(struct emc_cache *flow_cache)
 								{
 								    int i;
-												dpif-netdev: Garbage collect the exact match cache periodically.

On current master, the exact match cache entry can keep reference to
'struct dp_netdev_flow' even after the flow is removed from the flow
table.  This means the free of allocated memory of the flow is delayed
until the exact match cache entry is cleared or replaced.

If the allocated memory is ahead of chunks of freed memory on heap,
the delay will prevent the reclaim of those freed chunks, causing
falsely high memory utilization.

To fix the issue, this commit makes the owning thread conduct periodic
garbage collection on the exact match cache and clear dead entries.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

---
PATCH -> V2:
- Adopt Jarno's suggestion and conduct slow sweep to avoid introducing
  jitter.

											
										
										
											2014-11-14 15:54:56 -08:00
+								    flow_cache->sweep_idx = 0;
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 								        flow_cache->entries[i].flow = NULL;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								        flow_cache->entries[i].key.hash = 0;
-												flow: Make compile with MSVC.

MSVC does not like zero sized arrays in structs.  Hence, remove the
'values' member from struct miniflow and add back the getters
miniflow_values() and miniflow_get_values().

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-16 17:42:24 -07:00
+								        flow_cache->entries[i].key.len = sizeof(struct miniflow);
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								        flowmap_init(&flow_cache->entries[i].key.mf.map);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    }
 								}
 								static void
 								emc_cache_uninit(struct emc_cache *flow_cache)
 								{
 								    int i;
 								    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 								        emc_clear_entry(&flow_cache->entries[i]);
 								    }
 								}
-												dpif-netdev: Garbage collect the exact match cache periodically.

On current master, the exact match cache entry can keep reference to
'struct dp_netdev_flow' even after the flow is removed from the flow
table.  This means the free of allocated memory of the flow is delayed
until the exact match cache entry is cleared or replaced.

If the allocated memory is ahead of chunks of freed memory on heap,
the delay will prevent the reclaim of those freed chunks, causing
falsely high memory utilization.

To fix the issue, this commit makes the owning thread conduct periodic
garbage collection on the exact match cache and clear dead entries.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

---
PATCH -> V2:
- Adopt Jarno's suggestion and conduct slow sweep to avoid introducing
  jitter.

											
										
										
											2014-11-14 15:54:56 -08:00
+								/* Check and clear dead flow references slowly (one entry at each
 								 * invocation).  */
 								static void
 								emc_cache_slow_sweep(struct emc_cache *flow_cache)
 								{
 								    struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
 								    if (!emc_entry_alive(entry)) {
 								        emc_clear_entry(entry);
 								    }
 								    flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
 								}
-												dpif: Generalize test for dummy dpifs beyond the name.

When --enable-dummy=system or --enable-dummy=override is in use, dpifs
other than "dummy" are actually dummy dpifs, so use a more reliable test.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2015-06-13 15:08:31 -07:00
+								/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
 								bool
 								dpif_is_netdev(const struct dpif *dpif)
 								{
 								    return dpif->dpif_class->open == dpif_netdev_open;
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static struct dpif_netdev *
 								dpif_netdev_cast(const struct dpif *dpif)
 								{
-												dpif: Generalize test for dummy dpifs beyond the name.

When --enable-dummy=system or --enable-dummy=override is in use, dpifs
other than "dummy" are actually dummy dpifs, so use a more reliable test.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2015-06-13 15:08:31 -07:00
+								    ovs_assert(dpif_is_netdev(dpif));
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
 								}
 								static struct dp_netdev *
 								get_dp_netdev(const struct dpif *dpif)
 								{
 								    return dpif_netdev_cast(dpif)->dp;
 								}
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
 								enum pmd_info_type {
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								    PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
 								    PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
 								    PMD_INFO_SHOW_RXQ     /* Show poll-lists of pmd threads. */
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
+								};
 								static void
 								pmd_info_show_stats(struct ds *reply,
 								                    struct dp_netdev_pmd_thread *pmd,
 								                    unsigned long long stats[DP_N_STATS],
 								                    uint64_t cycles[PMD_N_CYCLES])
 								{
 								    unsigned long long total_packets = 0;
 								    uint64_t total_cycles = 0;
 								    int i;
 								    /* These loops subtracts reference values ('*_zero') from the counters.
 								     * Since loads and stores are relaxed, it might be possible for a '*_zero'
 								     * value to be more recent than the current value we're reading from the
 								     * counter.  This is not a big problem, since these numbers are not
 								     * supposed to be too accurate, but we should at least make sure that
 								     * the result is not negative. */
 								    for (i = 0; i < DP_N_STATS; i++) {
 								        if (stats[i] > pmd->stats_zero[i]) {
 								            stats[i] -= pmd->stats_zero[i];
 								        } else {
 								            stats[i] = 0;
 								        }
 								        if (i != DP_STAT_LOST) {
 								            /* Lost packets are already included in DP_STAT_MISS */
 								            total_packets += stats[i];
 								        }
 								    }
 								    for (i = 0; i < PMD_N_CYCLES; i++) {
 								        if (cycles[i] > pmd->cycles_zero[i]) {
 								           cycles[i] -= pmd->cycles_zero[i];
 								        } else {
 								            cycles[i] = 0;
 								        }
 								        total_cycles += cycles[i];
 								    }
 								    ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
 								                        ? "main thread" : "pmd thread");
 								    if (pmd->numa_id != OVS_NUMA_UNSPEC) {
 								        ds_put_format(reply, " numa_id %d", pmd->numa_id);
 								    }
-												netdev-dpdk: Properly support non pmd threads.

We used to reserve DPDK lcore 0 for non pmd operations, making it
difficult to use core 0 for packet processing.
DPDK 2.0 properly support non EAL threads with lcore LCORE_ID_ANY.

Using non EAL threads for non pmd threads, we do not need to reserve
any core for non pmd operations

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:20 +01:00
+								    if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
-												ovs-numa: Change 'core_id' to unsigned.

DPDK lcore_id is unsigned.  We need to support big values like
LCORE_ID_ANY (=UINT32_MAX).  Therefore I am changing the type everywhere
in OVS.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:19 +01:00
+								        ds_put_format(reply, " core_id %u", pmd->core_id);
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
+								    }
 								    ds_put_cstr(reply, ":\n");
 								    ds_put_format(reply,
 								                  "\temc hits:%llu\n\tmegaflow hits:%llu\n"
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								                  "\tavg. subtable lookups per hit:%.2f\n"
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
+								                  "\tmiss:%llu\n\tlost:%llu\n",
 								                  stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								                  stats[DP_STAT_MASKED_HIT] > 0
 								                  ? (1.0*stats[DP_STAT_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT]
 								                  : 0,
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
+								                  stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
 								    if (total_cycles == 0) {
 								        return;
 								    }
 								    ds_put_format(reply,
 								                  "\tpolling cycles:%"PRIu64" (%.02f%%)\n"
 								                  "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
 								                  cycles[PMD_CYCLES_POLLING],
 								                  cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100,
 								                  cycles[PMD_CYCLES_PROCESSING],
 								                  cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
 								    if (total_packets == 0) {
 								        return;
 								    }
 								    ds_put_format(reply,
 								                  "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
 								                  total_cycles / (double)total_packets,
 								                  total_cycles, total_packets);
 								    ds_put_format(reply,
 								                  "\tavg processing cycles per packet: "
 								                  "%.02f (%"PRIu64"/%llu)\n",
 								                  cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
 								                  cycles[PMD_CYCLES_PROCESSING], total_packets);
 								}
 								static void
 								pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
 								                    struct dp_netdev_pmd_thread *pmd,
 								                    unsigned long long stats[DP_N_STATS],
 								                    uint64_t cycles[PMD_N_CYCLES])
 								{
 								    int i;
 								    /* We cannot write 'stats' and 'cycles' (because they're written by other
 								     * threads) and we shouldn't change 'stats' (because they're used to count
 								     * datapath stats, which must not be cleared here).  Instead, we save the
 								     * current values and subtract them from the values to be displayed in the
 								     * future */
 								    for (i = 0; i < DP_N_STATS; i++) {
 								        pmd->stats_zero[i] = stats[i];
 								    }
 								    for (i = 0; i < PMD_N_CYCLES; i++) {
 								        pmd->cycles_zero[i] = cycles[i];
 								    }
 								}
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								static void
 								pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
 								{
 								    if (pmd->core_id != NON_PMD_CORE_ID) {
 								        struct rxq_poll *poll;
 								        const char *prev_name = NULL;
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        ds_put_format(reply,
 								                      "pmd thread numa_id %d core_id %u:\n\tisolated : %s\n",
 								                      pmd->numa_id, pmd->core_id, (pmd->isolated)
 								                                                  ? "true" : "false");
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        ovs_mutex_lock(&pmd->port_mutex);
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								        LIST_FOR_EACH (poll, node, &pmd->poll_list) {
 								            const char *name = netdev_get_name(poll->port->netdev);
 								            if (!prev_name || strcmp(name, prev_name)) {
 								                if (prev_name) {
 								                    ds_put_cstr(reply, "\n");
 								                }
 								                ds_put_format(reply, "\tport: %s\tqueue-id:",
 								                              netdev_get_name(poll->port->netdev));
 								            }
 								            ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx));
 								            prev_name = name;
 								        }
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        ovs_mutex_unlock(&pmd->port_mutex);
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								        ds_put_cstr(reply, "\n");
 								    }
 								}
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
+								static void
 								dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
 								                     void *aux)
 								{
 								    struct ds reply = DS_EMPTY_INITIALIZER;
 								    struct dp_netdev_pmd_thread *pmd;
 								    struct dp_netdev *dp = NULL;
 								    enum pmd_info_type type = *(enum pmd_info_type *) aux;
 								    ovs_mutex_lock(&dp_netdev_mutex);
 								    if (argc == 2) {
 								        dp = shash_find_data(&dp_netdevs, argv[1]);
 								    } else if (shash_count(&dp_netdevs) == 1) {
 								        /* There's only one datapath */
 								        dp = shash_first(&dp_netdevs)->data;
 								    }
 								    if (!dp) {
 								        ovs_mutex_unlock(&dp_netdev_mutex);
 								        unixctl_command_reply_error(conn,
 								                                    "please specify an existing datapath");
 								        return;
 								    }
 								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								        if (type == PMD_INFO_SHOW_RXQ) {
 								            pmd_info_show_rxq(&reply, pmd);
 								        } else {
 								            unsigned long long stats[DP_N_STATS];
 								            uint64_t cycles[PMD_N_CYCLES];
 								            int i;
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								            /* Read current stats and cycle counters */
 								            for (i = 0; i < ARRAY_SIZE(stats); i++) {
 								                atomic_read_relaxed(&pmd->stats.n[i], &stats[i]);
 								            }
 								            for (i = 0; i < ARRAY_SIZE(cycles); i++) {
 								                atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]);
 								            }
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								            if (type == PMD_INFO_CLEAR_STATS) {
 								                pmd_info_clear_stats(&reply, pmd, stats, cycles);
 								            } else if (type == PMD_INFO_SHOW_STATS) {
 								                pmd_info_show_stats(&reply, pmd, stats, cycles);
 								            }
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
+								        }
 								    }
 								    ovs_mutex_unlock(&dp_netdev_mutex);
 								    unixctl_command_reply(conn, ds_cstr(&reply));
 								    ds_destroy(&reply);
 								}
 								static int
 								dpif_netdev_init(void)
 								{
 								    static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								                              clear_aux = PMD_INFO_CLEAR_STATS,
 								                              poll_aux = PMD_INFO_SHOW_RXQ;
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
 								    unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
 , 1, dpif_netdev_pmd_info,
 								                             (void *)&show_aux);
 								    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
 , 1, dpif_netdev_pmd_info,
 								                             (void *)&clear_aux);
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
 , 1, dpif_netdev_pmd_info,
 								                             (void *)&poll_aux);
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
+								    return 0;
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: allow for proper destruction of netdev datapaths

Until now, bridges with datapath_type=netdev did not destroy the datapath
when deleted. In particular, the tap device implementing the internal
interface was not close()d, and therefore the tap persists until
ovs-vswitchd exit()s.

This behaviour was caused by the missing callback for 'enumerate' in the
dpif-netdev class. Without this callback 'bridge_reconfigure' failed to
realize that there are datapaths with no bridge, and thus cannot destroy
them. Providing an 'enumerate' callback fixes this.

Signed-off-by: Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-09 12:17:15 +02:00
+								static int
-												dpif-netdev: enumerate dpif belonging to the right class

Since dpif_netdev_enumerate() is used for "netdev" and "dummy" class, it
incorrectly lists dpif-netdevs as "dummy" and vice versa.
This patches address the issue by changing the dpif-provider interface: a
dpif_class parameter is passed to the 'enumerate' call to match the right class.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-12 16:37:33 -07:00
+								dpif_netdev_enumerate(struct sset *all_dps,
 								                      const struct dpif_class *dpif_class)
-												dpif-netdev: allow for proper destruction of netdev datapaths

Until now, bridges with datapath_type=netdev did not destroy the datapath
when deleted. In particular, the tap device implementing the internal
interface was not close()d, and therefore the tap persists until
ovs-vswitchd exit()s.

This behaviour was caused by the missing callback for 'enumerate' in the
dpif-netdev class. Without this callback 'bridge_reconfigure' failed to
realize that there are datapaths with no bridge, and thus cannot destroy
them. Providing an 'enumerate' callback fixes this.

Signed-off-by: Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-09 12:17:15 +02:00
+								{
 								    struct shash_node *node;
-												clang: Add annotations for thread safety check.

This commit adds annotations for thread safety check. And the
check can be conducted by using -Wthread-safety flag in clang.

Co-authored-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-30 15:31:48 -07:00
+								    ovs_mutex_lock(&dp_netdev_mutex);
-												dpif-netdev: allow for proper destruction of netdev datapaths

Until now, bridges with datapath_type=netdev did not destroy the datapath
when deleted. In particular, the tap device implementing the internal
interface was not close()d, and therefore the tap persists until
ovs-vswitchd exit()s.

This behaviour was caused by the missing callback for 'enumerate' in the
dpif-netdev class. Without this callback 'bridge_reconfigure' failed to
realize that there are datapaths with no bridge, and thus cannot destroy
them. Providing an 'enumerate' callback fixes this.

Signed-off-by: Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-09 12:17:15 +02:00
+								    SHASH_FOR_EACH(node, &dp_netdevs) {
-												dpif-netdev: enumerate dpif belonging to the right class

Since dpif_netdev_enumerate() is used for "netdev" and "dummy" class, it
incorrectly lists dpif-netdevs as "dummy" and vice versa.
This patches address the issue by changing the dpif-provider interface: a
dpif_class parameter is passed to the 'enumerate' call to match the right class.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-12 16:37:33 -07:00
+								        struct dp_netdev *dp = node->data;
 								        if (dpif_class != dp->class) {
 								            /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
 								             * If the class doesn't match, skip this dpif. */
 								             continue;
 								        }
-												dpif-netdev: allow for proper destruction of netdev datapaths

Until now, bridges with datapath_type=netdev did not destroy the datapath
when deleted. In particular, the tap device implementing the internal
interface was not close()d, and therefore the tap persists until
ovs-vswitchd exit()s.

This behaviour was caused by the missing callback for 'enumerate' in the
dpif-netdev class. Without this callback 'bridge_reconfigure' failed to
realize that there are datapaths with no bridge, and thus cannot destroy
them. Providing an 'enumerate' callback fixes this.

Signed-off-by: Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-09 12:17:15 +02:00
+								        sset_add(all_dps, node->name);
 								    }
-												clang: Add annotations for thread safety check.

This commit adds annotations for thread safety check. And the
check can be conducted by using -Wthread-safety flag in clang.

Co-authored-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-30 15:31:48 -07:00
+								    ovs_mutex_unlock(&dp_netdev_mutex);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												dpif-netdev: allow for proper destruction of netdev datapaths

Until now, bridges with datapath_type=netdev did not destroy the datapath
when deleted. In particular, the tap device implementing the internal
interface was not close()d, and therefore the tap persists until
ovs-vswitchd exit()s.

This behaviour was caused by the missing callback for 'enumerate' in the
dpif-netdev class. Without this callback 'bridge_reconfigure' failed to
realize that there are datapaths with no bridge, and thus cannot destroy
them. Providing an 'enumerate' callback fixes this.

Signed-off-by: Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-09 12:17:15 +02:00
+								    return 0;
 								}
-												dpif-netdev: Allow stub interfaces on the dummy datapath.

Future patches will need to add netdevs to the dummy datapath which
can't actually send or receive packets.

Signed-off-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-01-08 14:37:23 -08:00
+								static bool
 								dpif_netdev_class_is_dummy(const struct dpif_class *class)
 								{
 								    return class != &dpif_netdev_class;
 								}
-												Add functions to determine how port should be opened based on type.

Depending on the port and type of datapath, a port may need to be opened
as a different type of device than it's configured.  For example, an
"internal" port on a "dummy" datapath should opened as a "dummy" port.
This commit adds the ability for a dpif to provide this information to a
caller.  It will be used in a future commit.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-11-14 15:50:20 -08:00
+								static const char *
 								dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
 								{
 								    return strcmp(type, "internal") ? type
-												netdev-dummy: Add dummy-internal class.

"internal" netdevs are treated specially in OVS (e.g. for MTU), but
the dummy datapath remaps both "system" and "internal" devices to the
same "dummy" netdev class, so there's no way to discern those in tests.

This commit adds a new "dummy-internal" netdev type, which will be used
by the dummy datapath for internal ports, so that other parts of the
code can understand which ports are internal just by looking at the
netdev object.

The alternative solution, using the original interface type ("internal")
instead of the translated netdev type ("dummy"), is harder to implement,
because in so many places only the netdev object is available.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-02-11 13:11:10 -08:00
+								                  : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
-												Add functions to determine how port should be opened based on type.

Depending on the port and type of datapath, a port may need to be opened
as a different type of device than it's configured.  For example, an
"internal" port on a "dummy" datapath should opened as a "dummy" port.
This commit adds the ability for a dpif to provide this information to a
caller.  It will be used in a future commit.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-11-14 15:50:20 -08:00
+								                  : "tap";
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static struct dpif *
 								create_dpif_netdev(struct dp_netdev *dp)
 								{
-												dpif-netdev: Simplify code by using shash for names and dropping indexes.

											
										
										
											2010-11-24 12:35:22 -08:00
+								    uint16_t netflow_id = hash_string(dp->name, 0);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    struct dpif_netdev *dpif;
-												dpif-netdev: Take advantage of ovs_refcount for dp_netdev.

By making "destroyed" own a reference, we can treat dp_netdev's ref_cnt
like any other in Open vSwitch.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 19:41:10 -08:00
+								    ovs_refcount_ref(&dp->ref_cnt);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
 								    dpif = xmalloc(sizeof *dpif);
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    dpif->dp = dp;
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    dpif->last_port_seq = seq_read(dp->port_seq);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
 								    return &dpif->dpif;
 								}
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								/* Choose an unused, non-zero port number and return it on success.
 								 * Return ODPP_NONE on failure. */
 								static odp_port_t
-												tests: Rewrite unit tests to not expect bridge with odp zero.

A future commit will make all bridges of a particular type share a
single backing datapath.  That backing datapath will have a datapath
port number of zero and bridges will be assigned other numbers.  This
commit modifies the tests so that they don't expect port zero.

It adopts the convention that bridges of type "dummy" with a name of the
form "br<n>" will be assigned a port number of 100+<n>.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-10-13 17:45:00 -07:00
+								choose_port(struct dp_netdev *dp, const char *name)
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    OVS_REQUIRES(dp->port_mutex)
-												tests: Rewrite unit tests to not expect bridge with odp zero.

A future commit will make all bridges of a particular type share a
single backing datapath.  That backing datapath will have a datapath
port number of zero and bridges will be assigned other numbers.  This
commit modifies the tests so that they don't expect port zero.

It adopts the convention that bridges of type "dummy" with a name of the
form "br<n>" will be assigned a port number of 100+<n>.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-10-13 17:45:00 -07:00
+								{
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								    uint32_t port_no;
-												tests: Rewrite unit tests to not expect bridge with odp zero.

A future commit will make all bridges of a particular type share a
single backing datapath.  That backing datapath will have a datapath
port number of zero and bridges will be assigned other numbers.  This
commit modifies the tests so that they don't expect port zero.

It adopts the convention that bridges of type "dummy" with a name of the
form "br<n>" will be assigned a port number of 100+<n>.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-10-13 17:45:00 -07:00
 								    if (dp->class != &dpif_netdev_class) {
 								        const char *p;
 								        int start_no = 0;
 								        /* If the port name begins with "br", start the number search at
 								         * 100 to make writing tests easier. */
 								        if (!strncmp(name, "br", 2)) {
 								            start_no = 100;
 								        }
 								        /* If the port name contains a number, try to assign that port number.
 								         * This can make writing unit tests easier because port numbers are
 								         * predictable. */
 								        for (p = name; *p != '\0'; p++) {
 								            if (isdigit((unsigned char) *p)) {
 								                port_no = start_no + strtol(p, NULL, 10);
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								                if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
 								                    && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								                    return u32_to_odp(port_no);
-												tests: Rewrite unit tests to not expect bridge with odp zero.

A future commit will make all bridges of a particular type share a
single backing datapath.  That backing datapath will have a datapath
port number of zero and bridges will be assigned other numbers.  This
commit modifies the tests so that they don't expect port zero.

It adopts the convention that bridges of type "dummy" with a name of the
form "br<n>" will be assigned a port number of 100+<n>.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-10-13 17:45:00 -07:00
+								                }
 								                break;
 								            }
 								        }
 								    }
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
 								        if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								            return u32_to_odp(port_no);
-												tests: Rewrite unit tests to not expect bridge with odp zero.

A future commit will make all bridges of a particular type share a
single backing datapath.  That backing datapath will have a datapath
port number of zero and bridges will be assigned other numbers.  This
commit modifies the tests so that they don't expect port zero.

It adopts the convention that bridges of type "dummy" with a name of the
form "br<n>" will be assigned a port number of 100+<n>.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-10-13 17:45:00 -07:00
+								        }
 								    }
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								    return ODPP_NONE;
-												tests: Rewrite unit tests to not expect bridge with odp zero.

A future commit will make all bridges of a particular type share a
single backing datapath.  That backing datapath will have a datapath
port number of zero and bridges will be assigned other numbers.  This
commit modifies the tests so that they don't expect port zero.

It adopts the convention that bridges of type "dummy" with a name of the
form "br<n>" will be assigned a port number of 100+<n>.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-10-13 17:45:00 -07:00
+								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static int
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								create_dp_netdev(const char *name, const struct dpif_class *class,
 								                 struct dp_netdev **dpp)
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    OVS_REQUIRES(dp_netdev_mutex)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp;
 								    int error;
-												dpif-netdev: Simplify code by using shash for names and dropping indexes.

											
										
										
											2010-11-24 12:35:22 -08:00
+								    dp = xzalloc(sizeof *dp);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    shash_add(&dp_netdevs, name, dp);
 								    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
 								    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
-												dpif-netdev: Take advantage of ovs_refcount for dp_netdev.

By making "destroyed" own a reference, we can treat dp_netdev's ref_cnt
like any other in Open vSwitch.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 19:41:10 -08:00
+								    ovs_refcount_init(&dp->ref_cnt);
-												dpif-netdev: init atomic flag dp->destroyed

It is better to explicitly initialize the dp->destroy than to rely
on xzalloc().

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-17 22:10:53 -07:00
+								    atomic_flag_clear(&dp->destroyed);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_init(&dp->port_mutex);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    hmap_init(&dp->ports);
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    dp->port_seq = seq_create();
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								    fat_rwlock_init(&dp->upcall_rwlock);
-												dpif-netdev: Add reconfiguration request to dp_netdev.

Next patches will add new conditions when reconfiguration will be
required. It'll be simpler to have common way to request reconfiguration.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:43 +03:00
+								    dp->reconfigure_seq = seq_create();
 								    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								    /* Disable upcalls by default. */
 								    dp_netdev_disable_upcall(dp);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								    dp->upcall_aux = NULL;
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								    dp->upcall_cb = NULL;
-												tests: Rewrite unit tests to not expect bridge with odp zero.

A future commit will make all bridges of a particular type share a
single backing datapath.  That backing datapath will have a datapath
port number of zero and bridges will be assigned other numbers.  This
commit modifies the tests so that they don't expect port zero.

It adopts the convention that bridges of type "dummy" with a name of the
form "br<n>" will be assigned a port number of 100+<n>.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-10-13 17:45:00 -07:00
-												dpif-netdev: Execute conntrack action.

This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will not
discard anymore flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>

											
										
										
											2015-11-15 22:07:25 -08:00
+								    conntrack_init(&dp->conntrack);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    cmap_init(&dp->poll_threads);
 								    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
 								    ovsthread_key_create(&dp->per_pmd_key, NULL);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								    dp_netdev_set_nonpmd(dp);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
-												dpif-netdev: use the open_type when creating the local port

Instead of using the internal type, use the port_open_type when creating the
local port. That makes sure that whenever dpif_port_query is used, the netdev
open_type is returned instead of the "internal" type.

For other ports, that is already the case, as the netdev type is used when
creating the dp_netdev_port.

That changes the output of dpctl when showing the local port, and also when
trying to change its type. So, corresponding tests are fixed.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 12:06:44 -03:00
+								    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
 								                                                             "internal"),
 								                        ODPP_LOCAL);
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_unlock(&dp->port_mutex);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    if (error) {
 								        dp_netdev_free(dp);
-												dpif-netdev: Simplify code by using shash for names and dropping indexes.

											
										
										
											2010-11-24 12:35:22 -08:00
+								        return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
-												dpif-netdev: Simplify code by using shash for names and dropping indexes.

											
										
										
											2010-11-24 12:35:22 -08:00
+								    *dpp = dp;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    return 0;
 								}
-												dpif-netdev: Add reconfiguration request to dp_netdev.

Next patches will add new conditions when reconfiguration will be
required. It'll be simpler to have common way to request reconfiguration.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:43 +03:00
+								static void
 								dp_netdev_request_reconfigure(struct dp_netdev *dp)
 								{
 								    seq_change(dp->reconfigure_seq);
 								}
 								static bool
 								dp_netdev_is_reconf_required(struct dp_netdev *dp)
 								{
 								    return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static int
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								dpif_netdev_open(const struct dpif_class *class, const char *name,
-												dpif: Make dpif_class 'open' function take class instead of type name.

This makes it easier for dpif_provider implementations to share code but
distinguish the class actually in use, because comparing a pointer is
easier than comparing a string.

											
										
										
											2010-11-18 10:06:41 -08:00
+								                 bool create, struct dpif **dpifp)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												dpif-netdev: Simplify code by using shash for names and dropping indexes.

											
										
										
											2010-11-24 12:35:22 -08:00
+								    struct dp_netdev *dp;
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								    int error;
-												dpif-netdev: Simplify code by using shash for names and dropping indexes.

											
										
										
											2010-11-24 12:35:22 -08:00
-												clang: Add annotations for thread safety check.

This commit adds annotations for thread safety check. And the
check can be conducted by using -Wthread-safety flag in clang.

Co-authored-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-30 15:31:48 -07:00
+								    ovs_mutex_lock(&dp_netdev_mutex);
-												dpif-netdev: Simplify code by using shash for names and dropping indexes.

											
										
										
											2010-11-24 12:35:22 -08:00
+								    dp = shash_find_data(&dp_netdevs, name);
 								    if (!dp) {
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    } else {
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								        error = (dp->class != class ? EINVAL
 								                 : create ? EEXIST
 								                 : 0);
 								    }
 								    if (!error) {
 								        *dpifp = create_dpif_netdev(dp);
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								        dp->dpif = *dpifp;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												clang: Add annotations for thread safety check.

This commit adds annotations for thread safety check. And the
check can be conducted by using -Wthread-safety flag in clang.

Co-authored-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-30 15:31:48 -07:00
+								    ovs_mutex_unlock(&dp_netdev_mutex);
-												dpif-netdev: Simplify code by using shash for names and dropping indexes.

											
										
										
											2010-11-24 12:35:22 -08:00
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								    return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												dpif-netdev: fix dp_netdev_free()

dp_netdev_free() must free 'dp->upcall_rwlock', but when upcalls are
disabled (if the datapath is being freed upcalls should be disabled)
'dp->upcall_rwlock' is taken and freeing it causes an assertion to
fail.

This commit takes makes sure that the upcalls are disabled and
releases 'dp->upcall_rwlock' before freeing it. A simple testcase is
added to detect the failure.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-10-03 15:04:15 -07:00
+								static void
 								dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
 								    OVS_NO_THREAD_SAFETY_ANALYSIS
 								{
 								    /* Check that upcalls are disabled, i.e. that the rwlock is taken */
 								    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
 								    /* Before freeing a lock we should release it */
 								    fat_rwlock_unlock(&dp->upcall_rwlock);
 								    fat_rwlock_destroy(&dp->upcall_rwlock);
 								}
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
 								 * through the 'dp_netdevs' shash while freeing 'dp'. */
-												datapath: Drop queue information from odp_stats.

This queue information will be available through the kernel socket layer
once we move over to Netlink socket as transports, so we might as well get
rid of the redundancy.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-04 17:00:36 -08:00
+								static void
 								dp_netdev_free(struct dp_netdev *dp)
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    OVS_REQUIRES(dp_netdev_mutex)
-												datapath: Drop queue information from odp_stats.

This queue information will be available through the kernel socket layer
once we move over to Netlink socket as transports, so we might as well get
rid of the redundancy.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-04 17:00:36 -08:00
+								{
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    struct dp_netdev_port *port, *next;
-												dpif-netdev: Avoid pointlessly maintaining a port count.

'n_ports' was only used for testing for nonzero, and we can rewrite the
code that does that to more straightforwardly use LIST_FOR_EACH_SAFE.

											
										
										
											2011-08-10 12:40:10 -07:00
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    shash_find_and_delete(&dp_netdevs, dp->name);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    dp_netdev_destroy_all_pmds(dp);
 								    ovs_mutex_destroy(&dp->non_pmd_mutex);
 								    ovsthread_key_delete(dp->per_pmd_key);
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Execute conntrack action.

This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will not
discard anymore flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>

											
										
										
											2015-11-15 22:07:25 -08:00
+								    conntrack_destroy(&dp->conntrack);
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
-												bridge: Add test that ports that disappear get added back to the datapath.

The test added in this commit would have caught the bug fixed by commit
96be8de595150 (bridge: When ports disappear from a datapath, add them
back.).  With that commit reverted, the new test fails.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Gurucharan Shetty <gshetty@nicira.com>

											
										
										
											2014-05-22 09:36:00 -07:00
+								        do_del_port(dp, port);
-												datapath: Drop queue information from odp_stats.

This queue information will be available through the kernel socket layer
once we move over to Netlink socket as transports, so we might as well get
rid of the redundancy.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-04 17:00:36 -08:00
+								    }
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_unlock(&dp->port_mutex);
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
+								    cmap_destroy(&dp->poll_threads);
-												ovs-thread: Replace ovsthread_counter by more general ovsthread_stats.

This allows clients to do more than just increment a counter.  The
following commit will make the first use of that feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-19 07:47:12 -07:00
-												dpif-netdev: Add reconfiguration request to dp_netdev.

Next patches will add new conditions when reconfiguration will be
required. It'll be simpler to have common way to request reconfiguration.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:43 +03:00
+								    seq_destroy(dp->reconfigure_seq);
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    seq_destroy(dp->port_seq);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    hmap_destroy(&dp->ports);
-												dpif-netdev: Destroy 'port_mutex' in dp_netdev_free().

Found by inspection.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 18:10:33 -07:00
+								    ovs_mutex_destroy(&dp->port_mutex);
-												dpif-netdev: fix dp_netdev_free()

dp_netdev_free() must free 'dp->upcall_rwlock', but when upcalls are
disabled (if the datapath is being freed upcalls should be disabled)
'dp->upcall_rwlock' is taken and freeing it causes an assertion to
fail.

This commit takes makes sure that the upcalls are disabled and
releases 'dp->upcall_rwlock' before freeing it. A simple testcase is
added to detect the failure.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-10-03 15:04:15 -07:00
 								    /* Upcalls must be disabled at this point */
 								    dp_netdev_destroy_upcall_lock(dp);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								    free(dp->pmd_cmask);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    free(CONST_CAST(char *, dp->name));
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    free(dp);
 								}
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								static void
 								dp_netdev_unref(struct dp_netdev *dp)
 								{
 								    if (dp) {
 								        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
 								         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
 								        ovs_mutex_lock(&dp_netdev_mutex);
-												Use ovs_refcount_unref_relaxed.

After a quick analysis, in most cases the access to refcounted objects
is clearly protected either with an explicit lock/mutex, or RCU. there
are only a few places where I left a call to ovs_refcount_unref().
Upon closer analysis it may well be that those could also use the
relaxed form.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2014-07-07 13:18:46 -07:00
+								        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								            dp_netdev_free(dp);
 								        }
 								        ovs_mutex_unlock(&dp_netdev_mutex);
 								    }
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static void
 								dpif_netdev_close(struct dpif *dpif)
 								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    dp_netdev_unref(dp);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    free(dpif);
 								}
 								static int
-												Fix some regressions from the merge from master.

											
										
										
											2010-02-08 13:22:41 -05:00
+								dpif_netdev_destroy(struct dpif *dpif)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												dpif-netdev: Take advantage of ovs_refcount for dp_netdev.

By making "destroyed" own a reference, we can treat dp_netdev's ref_cnt
like any other in Open vSwitch.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 19:41:10 -08:00
+								    if (!atomic_flag_test_and_set(&dp->destroyed)) {
-												Use ovs_refcount_unref_relaxed.

After a quick analysis, in most cases the access to refcounted objects
is clearly protected either with an explicit lock/mutex, or RCU. there
are only a few places where I left a call to ovs_refcount_unref().
Upon closer analysis it may well be that those could also use the
relaxed form.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2014-07-07 13:18:46 -07:00
+								        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
-												dpif-netdev: Take advantage of ovs_refcount for dp_netdev.

By making "destroyed" own a reference, we can treat dp_netdev's ref_cnt
like any other in Open vSwitch.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 19:41:10 -08:00
+								            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
 								            OVS_NOT_REACHED();
 								        }
 								    }
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    return 0;
 								}
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
 								 * load/store semantics.  While the increment is not atomic, the load and
 								 * store operations are, making it impossible to read inconsistent values.
 								 *
 								 * This is used to update thread local stats counters. */
 								static void
 								non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
 								{
 								    unsigned long long tmp;
 								    atomic_read_relaxed(var, &tmp);
 								    tmp += n;
 								    atomic_store_relaxed(var, tmp);
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static int
-												dpif: Avoid use of  "struct ovs_dp_stats" in platform-independent modules.

Over time we wish to reduce the number of datapath-protocol.h definitions
used directly outside of Linux-specific code.  This commit removes use of
"struct ovs_dp_stats" from platform-independent code.

Bug #7559.

											
										
										
											2011-10-05 11:18:13 -07:00
+								dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
 								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								        unsigned long long n;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								        stats->n_flows += cmap_count(&pmd->flow_table);
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
-												dpif-netdev: Count exact match cache hits.

We used to count exact match cache hits and masked classifier hits
together. This commit splits the DP_STAT_HIT counter into two.
This change will be used by future commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:15 +01:00
+								        atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
 								        stats->n_hit += n;
 								        atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								        stats->n_hit += n;
 								        atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
 								        stats->n_missed += n;
 								        atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
 								        stats->n_lost += n;
-												ovs-thread: Replace ovsthread_counter by more general ovsthread_stats.

This allows clients to do more than just increment a counter.  The
following commit will make the first use of that feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-19 07:47:12 -07:00
+								    }
-												dpif-linux: fix the size of n_masks

The command ovs-dpctl can wrongly output the masks even if the
datapath does not implement mega flows. In this case the output
will be similar to the following:

system@ovs-system:
	lookups: hit:14 missed:41 lost:0
	flows: 0
	masks: hit:18446744073709551615 total:4294967295
		hit/pkt:335395346794719104.00
	port 0: ovs-system (internal)
	port 1: gre_system (gre: df_default=false, ttl=0)
	port 2: ots-br0 (internal)
	port 3: int0 (internal)
	port 4: vnet0
	port 5: vnet1

The problem depends on the fact that n_masks stats is stored as a
uint32 in the struct ovs_dp_megaflow_stats and as a uint64 in the
struct dpif_dp_stats. UINT32_MAX instead of UINT64_MAX should be
used to detect if the datapath supports megaflows or not.

Signed-off-by: Francesco Fusco <ffusco@redhat.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-17 20:18:18 +01:00
+								    stats->n_masks = UINT32_MAX;
-												dpif-linux: collect and display mega flow mask stats

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2013-10-21 14:37:34 -07:00
+								    stats->n_mask_hit = UINT64_MAX;
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    return 0;
 								}
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								static void
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								{
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    int old_seq;
-												dpif-netdev: Allow direct destroy of 'struct dp_netdev_port'.

Before this commit, when 'struct dp_netdev_port' is deleted from
'dpif-netdev' datapath, if there is pmd thread, the pmd thread
will release the last reference to the port and ovs-rcu postpone
the destroy.  However, the delayed close of object like 'struct
netdev' could cause failure in immediate re-add or reconfigure of
the same device.

To fix the above issue, this commit uses condition variable and
makes the main thread wait for pmd thread to release the reference
when deleting port.  Then, the main thread can directly destroy the
port.

Reported-by: Cian Ferriter <cian.ferriter@intel.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-11-07 13:27:05 -08:00
+								    if (pmd->core_id == NON_PMD_CORE_ID) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
 								        ovs_mutex_lock(&pmd->port_mutex);
 								        pmd_load_cached_ports(pmd);
 								        ovs_mutex_unlock(&pmd->port_mutex);
 								        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
-												dpif-netdev: Allow direct destroy of 'struct dp_netdev_port'.

Before this commit, when 'struct dp_netdev_port' is deleted from
'dpif-netdev' datapath, if there is pmd thread, the pmd thread
will release the last reference to the port and ovs-rcu postpone
the destroy.  However, the delayed close of object like 'struct
netdev' could cause failure in immediate re-add or reconfigure of
the same device.

To fix the above issue, this commit uses condition variable and
makes the main thread wait for pmd thread to release the reference
when deleting port.  Then, the main thread can directly destroy the
port.

Reported-by: Cian Ferriter <cian.ferriter@intel.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-11-07 13:27:05 -08:00
+								        return;
 								    }
 								    ovs_mutex_lock(&pmd->cond_mutex);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    atomic_add_relaxed(&pmd->change_seq, 1, &old_seq);
-												dpif-netdev: Allow direct destroy of 'struct dp_netdev_port'.

Before this commit, when 'struct dp_netdev_port' is deleted from
'dpif-netdev' datapath, if there is pmd thread, the pmd thread
will release the last reference to the port and ovs-rcu postpone
the destroy.  However, the delayed close of object like 'struct
netdev' could cause failure in immediate re-add or reconfigure of
the same device.

To fix the above issue, this commit uses condition variable and
makes the main thread wait for pmd thread to release the reference
when deleting port.  Then, the main thread can directly destroy the
port.

Reported-by: Cian Ferriter <cian.ferriter@intel.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-11-07 13:27:05 -08:00
+								    ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
 								    ovs_mutex_unlock(&pmd->cond_mutex);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								}
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								static uint32_t
 								hash_port_no(odp_port_t port_no)
 								{
 								    return hash_int(odp_to_u32(port_no), 0);
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static int
-												dpif-netdev: use the open_type when creating the local port

Instead of using the internal type, use the port_open_type when creating the
local port. That makes sure that whenever dpif_port_query is used, the netdev
open_type is returned instead of the "internal" type.

For other ports, that is already the case, as the netdev type is used when
creating the dp_netdev_port.

That changes the output of dpctl when showing the local port, and also when
trying to change its type. So, corresponding tests are fixed.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 12:06:44 -03:00
+								port_create(const char *devname, const char *type,
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								            odp_port_t port_no, struct dp_netdev_port **portp)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												netdev: Factor restoring flags into new "struct netdev_saved_flags".

This gets rid of the only per-instance data in "struct netdev", which
will make it possible to merge "struct netdev_dev" into "struct netdev" in
a later commit.

Ed Maste wrote the netdev-bsd changes in this commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Co-authored-by: Ed Maste <emaste@freebsd.org>
Signed-off-by: Ed Maste <emaste@freebsd.org>
Tested-by: Ed Maste <emaste@freebsd.org>

											
										
										
											2013-05-10 08:55:25 -07:00
+								    struct netdev_saved_flags *sf;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    struct dp_netdev_port *port;
-												dpif-netdev: Do not allow adding loopback devices

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-09-07 12:35:15 +03:00
+								    enum netdev_flags flags;
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								    struct netdev *netdev;
 								    int n_open_rxqs = 0;
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    int n_cores = 0;
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								    int i, error;
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    bool dynamic_txqs = false;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								    *portp = NULL;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
 								    /* Open and validate network device. */
-												dpif-netdev: use the open_type when creating the local port

Instead of using the internal type, use the port_open_type when creating the
local port. That makes sure that whenever dpif_port_query is used, the netdev
open_type is returned instead of the "internal" type.

For other ports, that is already the case, as the netdev type is used when
creating the dp_netdev_port.

That changes the output of dpctl when showing the local port, and also when
trying to change its type. So, corresponding tests are fixed.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 12:06:44 -03:00
+								    error = netdev_open(devname, type, &netdev);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    if (error) {
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								        return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
 								    /* XXX reject non-Ethernet devices */
-												dpif-netdev: Do not allow adding loopback devices

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-09-07 12:35:15 +03:00
+								    netdev_get_flags(netdev, &flags);
 								    if (flags & NETDEV_LOOPBACK) {
 								        VLOG_ERR("%s: cannot add a loopback device", devname);
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								        error = EINVAL;
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								        goto out;
-												dpif-netdev: Do not allow adding loopback devices

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-09-07 12:35:15 +03:00
+								    }
-												dpif-netdev: Create multiple tx/rx queues when adding dpdk interface.

Before this commit, ovs creates one tx and one rx queue for
each dpdk interface and uses only one poll thread for handling
I/O of all dpdk interfaces.  An upcoming patch will allow multiple
poll threads be created.  As a preparation, this commit changes
the dpif-netdev to create multiple tx/rx queues when the dpdk
interface is added.

Specifically, the number of rx queues will still be one per-dpdk
interface for this commit.  But upcoming work will allow user
create multiple rx queues.  The number of tx queues will be the
number of cpu cores on the machine.  Although not all the tx queues
will be used, each poll thread will have its own queue for
transmission on the dpdk interface.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-06-17 10:52:20 -07:00
+								    if (netdev_is_pmd(netdev)) {
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								        n_cores = ovs_numa_get_n_cores();
-												dpif-netdev: Create multiple tx/rx queues when adding dpdk interface.

Before this commit, ovs creates one tx and one rx queue for
each dpdk interface and uses only one poll thread for handling
I/O of all dpdk interfaces.  An upcoming patch will allow multiple
poll threads be created.  As a preparation, this commit changes
the dpif-netdev to create multiple tx/rx queues when the dpdk
interface is added.

Specifically, the number of rx queues will still be one per-dpdk
interface for this commit.  But upcoming work will allow user
create multiple rx queues.  The number of tx queues will be the
number of cpu cores on the machine.  Although not all the tx queues
will be used, each poll thread will have its own queue for
transmission on the dpdk interface.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-06-17 10:52:20 -07:00
 								        if (n_cores == OVS_CORE_UNSPEC) {
 								            VLOG_ERR("%s, cannot get cpu core info", devname);
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								            error = ENOENT;
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								            goto out;
-												dpif-netdev: Create multiple tx/rx queues when adding dpdk interface.

Before this commit, ovs creates one tx and one rx queue for
each dpdk interface and uses only one poll thread for handling
I/O of all dpdk interfaces.  An upcoming patch will allow multiple
poll threads be created.  As a preparation, this commit changes
the dpif-netdev to create multiple tx/rx queues when the dpdk
interface is added.

Specifically, the number of rx queues will still be one per-dpdk
interface for this commit.  But upcoming work will allow user
create multiple rx queues.  The number of tx queues will be the
number of cpu cores on the machine.  Although not all the tx queues
will be used, each poll thread will have its own queue for
transmission on the dpdk interface.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-06-17 10:52:20 -07:00
+								        }
 								        /* There can only be ovs_numa_get_n_cores() pmd threads,
-												dpif-netdev: Fix non-pmd thread queue id.

Non pmd threads have a core_id == UINT32_MAX, while queue ids used by
netdevs range from 0 to the number of CPUs.  Therefore core ids cannot
be used directly to select a queue.

This commit introduces a simple mapping to fix the problem: pmd threads
continue using queues 0 to N (where N is the number of CPUs in the
system), while non pmd threads use queue N+1.

Fixes: d5c199ea7ff7 ("netdev-dpdk: Properly support non pmd threads.")

Reported-by: 차은호 <eunho.cha@atto-research.com
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Mark D. Gray <mark.d.gray@intel.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Flavio Leitner <fbl@redhat.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-06-03 15:55:16 +01:00
+								         * so creates a txq for each, and one extra for the non
 								         * pmd threads. */
-												netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduce reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-26 15:58:24 -08:00
+								        error = netdev_set_tx_multiq(netdev, n_cores + 1);
-												netdev-dpdk: Fix DPDK rings broken by multi queue

DPDK rings don't need one queue per PMD thread and don't support multiple
queues (set_multiq function is undefined). To fix operation with DPDK rings,
this patch ignores EOPNOTSUPP error on netdev_set_multiq() and provides, for
DPDK rings, a netdev send() function that ignores the provided queue id
(= PMD thread core id).

Suggested-by: Maryam Tahhan <maryam.tahhan@intel.com>
Signed-off-by: David Verbeiren <david.verbeiren@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-14 19:01:49 +02:00
+								        if (error && (error != EOPNOTSUPP)) {
-												dpif-netdev: Create multiple tx/rx queues when adding dpdk interface.

Before this commit, ovs creates one tx and one rx queue for
each dpdk interface and uses only one poll thread for handling
I/O of all dpdk interfaces.  An upcoming patch will allow multiple
poll threads be created.  As a preparation, this commit changes
the dpif-netdev to create multiple tx/rx queues when the dpdk
interface is added.

Specifically, the number of rx queues will still be one per-dpdk
interface for this commit.  But upcoming work will allow user
create multiple rx queues.  The number of tx queues will be the
number of cpu cores on the machine.  Although not all the tx queues
will be used, each poll thread will have its own queue for
transmission on the dpdk interface.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-06-17 10:52:20 -07:00
+								            VLOG_ERR("%s, cannot set multiq", devname);
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								            goto out;
-												dpif-netdev: Create multiple tx/rx queues when adding dpdk interface.

Before this commit, ovs creates one tx and one rx queue for
each dpdk interface and uses only one poll thread for handling
I/O of all dpdk interfaces.  An upcoming patch will allow multiple
poll threads be created.  As a preparation, this commit changes
the dpif-netdev to create multiple tx/rx queues when the dpdk
interface is added.

Specifically, the number of rx queues will still be one per-dpdk
interface for this commit.  But upcoming work will allow user
create multiple rx queues.  The number of tx queues will be the
number of cpu cores on the machine.  Although not all the tx queues
will be used, each poll thread will have its own queue for
transmission on the dpdk interface.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-06-17 10:52:20 -07:00
+								        }
 								    }
-												netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduce reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-26 15:58:24 -08:00
 								    if (netdev_is_reconf_required(netdev)) {
 								        error = netdev_reconfigure(netdev);
 								        if (error) {
 								            goto out;
 								        }
 								    }
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    if (netdev_is_pmd(netdev)) {
 								        if (netdev_n_txq(netdev) < n_cores + 1) {
 								            dynamic_txqs = true;
 								        }
 								    }
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    port = xzalloc(sizeof *port);
-												tunnels: Don't initialize unnecessary packet metadata.

The addition of Geneve options to packet metadata significantly
expanded its size. It was reported that this can decrease performance
for DPDK ports by up to 25% since we need to initialize the whole
structure on each packet receive.

It is not really necessary to zero out the entire structure because
miniflow_extract() only copies the tunnel metadata when particular
fields indicate that it is valid. Therefore, as long as we zero out
these fields when the metadata is initialized and ensure that the
rest of the structure is correctly set in the presence of a tunnel,
we can avoid touching the tunnel fields on packet reception.

Reported-by: Ciara Loftus <ciara.loftus@intel.com>
Tested-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-06-30 19:19:40 -07:00
+								    port->port_no = port_no;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    port->netdev = netdev;
-												dpif-netdev: Keep count of elements in port->rxq[].

This will ease deleting a port with no open rxqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 14:25:33 -08:00
+								    port->n_rxq = netdev_n_rxq(netdev);
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    port->rxqs = xcalloc(port->n_rxq, sizeof *port->rxqs);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    port->type = xstrdup(type);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    ovs_mutex_init(&port->txq_used_mutex);
 								    port->dynamic_txqs = dynamic_txqs;
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
-												dpif-netdev: Keep count of elements in port->rxq[].

This will ease deleting a port with no open rxqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 14:25:33 -08:00
+								    for (i = 0; i < port->n_rxq; i++) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        error = netdev_rxq_open(netdev, &port->rxqs[i].rxq, i);
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								        if (error) {
-												netdev: Add support multiqueue recv.

new netdev type like DPDK can support multi-queue IO. Following
patch Adds support for same.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 20:52:06 -07:00
+								            VLOG_ERR("%s: cannot receive packets on this network device (%s)",
 								                     devname, ovs_strerror(errno));
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								            goto out_rxq_close;
-												netdev: Add support multiqueue recv.

new netdev type like DPDK can support multi-queue IO. Following
patch Adds support for same.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 20:52:06 -07:00
+								        }
-												dpif-netdev: Uses the OVS_CORE_UNSPEC instead of magic numbers.

This patch uses OVS_CORE_UNSPEC for the queue unpinned instead
of "-1". More important, the "-1" casted to unsigned int is
equal to NON_PMD_CORE_ID. We make the distinction between them.

Signed-off-by: nickcooper-zhangtonghao <nic@opencloud.tech>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2017-01-08 17:30:22 -08:00
+								        port->rxqs[i].core_id = OVS_CORE_UNSPEC;
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								        n_open_rxqs++;
-												netdev: Clean up and refactor packet receive interface.

The Open vSwitch tree only has one user of the ability for a netdev to
receive packets from a network device.  Thus, this commit simplifies the
common-case use of the netdev interface by replacing the "ethertype" option
from "struct netdev_options" by a new netdev_listen() call.

The only user of netdev_listen() wants to receive all packets from a
network device, so this commit also removes the ability to restrict the
received packets to a particular protocol.  (This ability was once used by
the Open vSwitch integrated DHCP client, but that code has been removed.)

This commit also simplifies and improves the implementation of the code
in netdev-linux that started listening to a network device.  Before, I had
not figured out how to avoid receiving all packets on all devices before
binding to a particular device, but I took a closer look at the kernel code
and figured it out.

I've tested that the userspace datapath (dpif-netdev), the only user of
netdev_recv(), still works after this change.

											
										
										
											2011-08-05 14:15:32 -07:00
+								    }
-												netdev: Factor restoring flags into new "struct netdev_saved_flags".

This gets rid of the only per-instance data in "struct netdev", which
will make it possible to merge "struct netdev_dev" into "struct netdev" in
a later commit.

Ed Maste wrote the netdev-bsd changes in this commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Co-authored-by: Ed Maste <emaste@freebsd.org>
Signed-off-by: Ed Maste <emaste@freebsd.org>
Tested-by: Ed Maste <emaste@freebsd.org>

											
										
										
											2013-05-10 08:55:25 -07:00
+								    error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    if (error) {
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								        goto out_rxq_close;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												netdev: Factor restoring flags into new "struct netdev_saved_flags".

This gets rid of the only per-instance data in "struct netdev", which
will make it possible to merge "struct netdev_dev" into "struct netdev" in
a later commit.

Ed Maste wrote the netdev-bsd changes in this commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Co-authored-by: Ed Maste <emaste@freebsd.org>
Signed-off-by: Ed Maste <emaste@freebsd.org>
Tested-by: Ed Maste <emaste@freebsd.org>

											
										
										
											2013-05-10 08:55:25 -07:00
+								    port->sf = sf;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								    *portp = port;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
 								    return 0;
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
 								out_rxq_close:
 								    for (i = 0; i < n_open_rxqs; i++) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        netdev_rxq_close(port->rxqs[i].rxq);
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								    }
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    ovs_mutex_destroy(&port->txq_used_mutex);
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								    free(port->type);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    free(port->txq_used);
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    free(port->rxqs);
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								    free(port);
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								out:
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								    netdev_close(netdev);
-												dpif-netdev: Proper error handling in do_add_port().

This fixes multiple error path mistakes in do_add_port, none of which
has been a problem in practice so far. This change will make it easier
for a following commit to return in case of error.

Also, this removes an unneeded special case for tunnel ports.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 10:31:18 -08:00
+								    return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								static int
 								do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
 								            odp_port_t port_no)
 								    OVS_REQUIRES(dp->port_mutex)
 								{
 								    struct dp_netdev_port *port;
 								    int error;
 								    /* Reject devices already in 'dp'. */
 								    if (!get_port_by_name(dp, devname, &port)) {
 								        return EEXIST;
 								    }
-												dpif-netdev: use the open_type when creating the local port

Instead of using the internal type, use the port_open_type when creating the
local port. That makes sure that whenever dpif_port_query is used, the netdev
open_type is returned instead of the "internal" type.

For other ports, that is already the case, as the netdev type is used when
creating the dp_netdev_port.

That changes the output of dpctl when showing the local port, and also when
trying to change its type. So, corresponding tests are fixed.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 12:06:44 -03:00
+								    error = port_create(devname, type, port_no, &port);
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								    if (error) {
 								        return error;
 								    }
 								    if (netdev_is_pmd(port->netdev)) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        int numa_id = netdev_get_numa_id(port->netdev);
 								        ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
 								        dp_netdev_set_pmds_on_numa(dp, numa_id);
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								    }
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
 								    dp_netdev_add_port_to_pmds(dp, port);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
-												dpif-netdev: Factor out port_create() from do_add_port().

Instead of performing every operation inside do_port_add() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 13:14:56 -07:00
+								    seq_change(dp->port_seq);
 								    return 0;
 								}
-												dpif-netdev: Make port numbers predictable for dummy dpif, for unit tests.

The unit tests feed a lot of flows through the ofproto-dpif "trace"
command, which means that they need to know the port numbers of the ports
that they create.  Until now, they've had to actually query those port
numbers from the database, which is a bit of unnecessary overhead for unit
tests.

This commit makes dummy dpif port numbers predictable: if the name of a
port contains a number, then the dummy dpif uses that number, if it is
valid and available, as the port number.

This commit also simplifies the unit tests that previously queried port
numbers to depend on the new behavior.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-12 15:23:23 -08:00
+								static int
 								dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								                     odp_port_t *port_nop)
-												dpif-netdev: Make port numbers predictable for dummy dpif, for unit tests.

The unit tests feed a lot of flows through the ofproto-dpif "trace"
command, which means that they need to know the port numbers of the ports
that they create.  Until now, they've had to actually query those port
numbers from the database, which is a bit of unnecessary overhead for unit
tests.

This commit makes dummy dpif port numbers predictable: if the name of a
port contains a number, then the dummy dpif uses that number, if it is
valid and available, as the port number.

This commit also simplifies the unit tests that previously queried port
numbers to depend on the new behavior.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-12 15:23:23 -08:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												netdev-vport: Don't return static data in netdev_vport_get_dpif_port().

Returning a static data buffer makes code more brittle and definitely
not thread-safe, so this commit switches to using a caller-provided
buffer instead.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-05-01 11:05:28 -07:00
+								    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
 								    const char *dpif_port;
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								    odp_port_t port_no;
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								    int error;
-												dpif-netdev: Make port numbers predictable for dummy dpif, for unit tests.

The unit tests feed a lot of flows through the ofproto-dpif "trace"
command, which means that they need to know the port numbers of the ports
that they create.  Until now, they've had to actually query those port
numbers from the database, which is a bit of unnecessary overhead for unit
tests.

This commit makes dummy dpif port numbers predictable: if the name of a
port contains a number, then the dummy dpif uses that number, if it is
valid and available, as the port number.

This commit also simplifies the unit tests that previously queried port
numbers to depend on the new behavior.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-12 15:23:23 -08:00
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
-												netdev-vport: Don't return static data in netdev_vport_get_dpif_port().

Returning a static data buffer makes code more brittle and definitely
not thread-safe, so this commit switches to using a caller-provided
buffer instead.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-05-01 11:05:28 -07:00
+								    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								    if (*port_nop != ODPP_NONE) {
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								        port_no = *port_nop;
 								        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
-												dpif: Allow the port number to be requested when adding an interface.

The datapath allows requesting a specific port number for a port, but
the dpif interface didn't expose it.  This commit adds that support.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-07-27 23:58:24 -07:00
+								    } else {
-												netdev-vport: Don't return static data in netdev_vport_get_dpif_port().

Returning a static data buffer makes code more brittle and definitely
not thread-safe, so this commit switches to using a caller-provided
buffer instead.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-05-01 11:05:28 -07:00
+								        port_no = choose_port(dp, dpif_port);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								        error = port_no == ODPP_NONE ? EFBIG : 0;
-												dpif: Allow the port number to be requested when adding an interface.

The datapath allows requesting a specific port number for a port, but
the dpif interface didn't expose it.  This commit adds that support.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-07-27 23:58:24 -07:00
+								    }
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								    if (!error) {
-												dpif-netdev: Make port numbers predictable for dummy dpif, for unit tests.

The unit tests feed a lot of flows through the ofproto-dpif "trace"
command, which means that they need to know the port numbers of the ports
that they create.  Until now, they've had to actually query those port
numbers from the database, which is a bit of unnecessary overhead for unit
tests.

This commit makes dummy dpif port numbers predictable: if the name of a
port contains a number, then the dummy dpif uses that number, if it is
valid and available, as the port number.

This commit also simplifies the unit tests that previously queried port
numbers to depend on the new behavior.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-12 15:23:23 -08:00
+								        *port_nop = port_no;
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
-												dpif-netdev: Make port numbers predictable for dummy dpif, for unit tests.

The unit tests feed a lot of flows through the ofproto-dpif "trace"
command, which means that they need to know the port numbers of the ports
that they create.  Until now, they've had to actually query those port
numbers from the database, which is a bit of unnecessary overhead for unit
tests.

This commit makes dummy dpif port numbers predictable: if the name of a
port contains a number, then the dummy dpif uses that number, if it is
valid and available, as the port number.

This commit also simplifies the unit tests that previously queried port
numbers to depend on the new behavior.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-12 15:23:23 -08:00
+								    }
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_unlock(&dp->port_mutex);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
 								    return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static int
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								    int error;
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
-												bridge: Add test that ports that disappear get added back to the datapath.

The test added in this commit would have caught the bug fixed by commit
96be8de595150 (bridge: When ports disappear from a datapath, add them
back.).  With that commit reverted, the new test fails.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Gurucharan Shetty <gshetty@nicira.com>

											
										
										
											2014-05-22 09:36:00 -07:00
+								    if (port_no == ODPP_LOCAL) {
 								        error = EINVAL;
 								    } else {
 								        struct dp_netdev_port *port;
 								        error = get_port_by_number(dp, port_no, &port);
 								        if (!error) {
 								            do_del_port(dp, port);
 								        }
 								    }
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_unlock(&dp->port_mutex);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
 								    return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static bool
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								is_valid_port_number(odp_port_t port_no)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								    return port_no != ODPP_NONE;
 								}
 								static struct dp_netdev_port *
 								dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    OVS_REQUIRES(dp->port_mutex)
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								{
 								    struct dp_netdev_port *port;
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
-												tunnels: Don't initialize unnecessary packet metadata.

The addition of Geneve options to packet metadata significantly
expanded its size. It was reported that this can decrease performance
for DPDK ports by up to 25% since we need to initialize the whole
structure on each packet receive.

It is not really necessary to zero out the entire structure because
miniflow_extract() only copies the tunnel metadata when particular
fields indicate that it is valid. Therefore, as long as we zero out
these fields when the metadata is initialized and ensure that the
rest of the structure is correctly set in the presence of a tunnel,
we can avoid touching the tunnel fields on packet reception.

Reported-by: Ciara Loftus <ciara.loftus@intel.com>
Tested-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-06-30 19:19:40 -07:00
+								        if (port->port_no == port_no) {
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								            return port;
 								        }
 								    }
 								    return NULL;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static int
 								get_port_by_number(struct dp_netdev *dp,
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								                   odp_port_t port_no, struct dp_netdev_port **portp)
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    OVS_REQUIRES(dp->port_mutex)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    if (!is_valid_port_number(port_no)) {
 								        *portp = NULL;
 								        return EINVAL;
 								    } else {
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								        *portp = dp_netdev_lookup_port(dp, port_no);
-												dpif: Return ENODEV from dpif_port_query_by_*() if there's no port.

bridge_delete_or_reconfigure() deletes every interface that's not dumped
by OFPROTO_PORT_FOR_EACH().  ofproto_dpif.c:port_dump_next(), used by
OFPROTO_PORT_FOR_EACH, checks if the ofport is in the datapath by
calling port_query_by_name().  If port_query_by_name() returns an error,
the dump is interrupted.  If port_query_by_name() returns ENODEV, the
device doesn't exist and the dump can continue.

port_query_by_name() for the userspace datapath returns ENOENT instead
of ENODEV.  This is expected by dpif_port_query_by_name(), but it's not
handled correctly by port_dump_next().

dpif-netdev handles reconfiguration errors for an interface by deleting
it from the datapath, so it's possible that a device is missing. When this
happens we must make sure that port_dump_next() continues to dump other
devices, otherwise they will be deleted and the two layers will have an
inconsistent view.

This commit fixes the problem by returning ENODEV from the userspace
datapath if the port doesn't exist, and by documenting this clearly in
the dpif interfaces.

The problem was found while developing new code.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2017-01-05 20:21:23 -08:00
+								        return *portp ? 0 : ENODEV;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
 								}
-												dpif-netdev: Add ref-counting for port.

DPDK Poll mode thread need to keep ref to dpif-port.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:19 -07:00
+								static void
-												dpif-netdev: Do not keep refcount for ports.

Only the main thread will delete ports after pausing every other
thread.  There's no need to keep count.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 15:25:03 -08:00
+								port_destroy(struct dp_netdev_port *port)
-												dpif-netdev: Add ref-counting for port.

DPDK Poll mode thread need to keep ref to dpif-port.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:19 -07:00
+								{
-												dpif-netdev: Do not keep refcount for ports.

Only the main thread will delete ports after pausing every other
thread.  There's no need to keep count.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 15:25:03 -08:00
+								    if (!port) {
 								        return;
-												dpif-netdev: Add ref-counting for port.

DPDK Poll mode thread need to keep ref to dpif-port.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:19 -07:00
+								    }
-												dpif-netdev: Do not keep refcount for ports.

Only the main thread will delete ports after pausing every other
thread.  There's no need to keep count.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 15:25:03 -08:00
+								    netdev_close(port->netdev);
 								    netdev_restore_flags(port->sf);
-												dpif-netdev: Allow direct destroy of 'struct dp_netdev_port'.

Before this commit, when 'struct dp_netdev_port' is deleted from
'dpif-netdev' datapath, if there is pmd thread, the pmd thread
will release the last reference to the port and ovs-rcu postpone
the destroy.  However, the delayed close of object like 'struct
netdev' could cause failure in immediate re-add or reconfigure of
the same device.

To fix the above issue, this commit uses condition variable and
makes the main thread wait for pmd thread to release the reference
when deleting port.  Then, the main thread can directly destroy the
port.

Reported-by: Cian Ferriter <cian.ferriter@intel.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-11-07 13:27:05 -08:00
-												dpif-netdev: Do not keep refcount for ports.

Only the main thread will delete ports after pausing every other
thread.  There's no need to keep count.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 15:25:03 -08:00
+								    for (unsigned i = 0; i < port->n_rxq; i++) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        netdev_rxq_close(port->rxqs[i].rxq);
-												dpif-netdev: Add ref-counting for port.

DPDK Poll mode thread need to keep ref to dpif-port.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:19 -07:00
+								    }
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    ovs_mutex_destroy(&port->txq_used_mutex);
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    free(port->rxq_affinity_list);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    free(port->txq_used);
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    free(port->rxqs);
-												dpif-netdev: Do not keep refcount for ports.

Only the main thread will delete ports after pausing every other
thread.  There's no need to keep count.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 15:25:03 -08:00
+								    free(port->type);
 								    free(port);
-												dpif-netdev: Add ref-counting for port.

DPDK Poll mode thread need to keep ref to dpif-port.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:19 -07:00
+								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static int
 								get_port_by_name(struct dp_netdev *dp,
 								                 const char *devname, struct dp_netdev_port **portp)
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    OVS_REQUIRES(dp->port_mutex)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev_port *port;
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    HMAP_FOR_EACH (port, node, &dp->ports) {
-												dpif-netdev: Don't run port names through netdev_vport_get_dpif_port().

The ports that exist within a dpif have already been translated through
netdev_vport_get_dpif_port(), so there is no value to translating them
again in the interfaces that query or dump ports (and possibly a drawback
if somehow the translation could change).

After this change, dpif-netdev translates port names in just one place,
the port_add path, which makes dpif-netdev act the same way as dpif-linux
in this respect.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-06-06 15:27:15 -07:00
+								        if (!strcmp(netdev_get_name(port->netdev), devname)) {
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								            *portp = port;
 								            return 0;
 								        }
 								    }
-												dpif: Return ENODEV from dpif_port_query_by_*() if there's no port.

bridge_delete_or_reconfigure() deletes every interface that's not dumped
by OFPROTO_PORT_FOR_EACH().  ofproto_dpif.c:port_dump_next(), used by
OFPROTO_PORT_FOR_EACH, checks if the ofport is in the datapath by
calling port_query_by_name().  If port_query_by_name() returns an error,
the dump is interrupted.  If port_query_by_name() returns ENODEV, the
device doesn't exist and the dump can continue.

port_query_by_name() for the userspace datapath returns ENOENT instead
of ENODEV.  This is expected by dpif_port_query_by_name(), but it's not
handled correctly by port_dump_next().

dpif-netdev handles reconfiguration errors for an interface by deleting
it from the datapath, so it's possible that a device is missing. When this
happens we must make sure that port_dump_next() continues to dump other
devices, otherwise they will be deleted and the two layers will have an
inconsistent view.

This commit fixes the problem by returning ENODEV from the userspace
datapath if the port doesn't exist, and by documenting this clearly in
the dpif interfaces.

The problem was found while developing new code.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2017-01-05 20:21:23 -08:00
 								    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
 								     * existing port. */
 								    return ENODEV;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								static int
 								get_n_pmd_threads(struct dp_netdev *dp)
 								{
 								    /* There is one non pmd thread in dp->poll_threads */
 								    return cmap_count(&dp->poll_threads) - 1;
 								}
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static int
 								get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
 								{
 								    struct dp_netdev_pmd_thread *pmd;
 								    int n_pmds = 0;
 								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
 								        if (pmd->numa_id == numa_id) {
 								            n_pmds++;
 								        }
 								    }
 								    return n_pmds;
 								}
-												dpif-netdev: Honor rxq affinity during pmd threads creation.

Currently, If user will set up 'pmd-rxq-affinity' to cores on
different numa node, they may not be polled, because pmd threads
will not be created there even if this cores are in 'pmd-cpu-mask'.
Fix that by creating threads on all numa nodes rxqs assigned to.

Fixes: 3eb67853c481 ("dpif-netdev: Introduce pmd-rxq-affinity.")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-11-15 14:36:39 +03:00
+								/* Returns 'true' if there is a port with pmd netdev and the netdev is on
 								 * numa node 'numa_id' or its rx queue assigned to core on that numa node . */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static bool
-												dpif-netdev: Honor rxq affinity during pmd threads creation.

Currently, If user will set up 'pmd-rxq-affinity' to cores on
different numa node, they may not be polled, because pmd threads
will not be created there even if this cores are in 'pmd-cpu-mask'.
Fix that by creating threads on all numa nodes rxqs assigned to.

Fixes: 3eb67853c481 ("dpif-netdev: Introduce pmd-rxq-affinity.")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-11-15 14:36:39 +03:00
+								has_pmd_rxq_for_numa(struct dp_netdev *dp, int numa_id)
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    OVS_REQUIRES(dp->port_mutex)
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								{
 								    struct dp_netdev_port *port;
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    HMAP_FOR_EACH (port, node, &dp->ports) {
-												dpif-netdev: Honor rxq affinity during pmd threads creation.

Currently, If user will set up 'pmd-rxq-affinity' to cores on
different numa node, they may not be polled, because pmd threads
will not be created there even if this cores are in 'pmd-cpu-mask'.
Fix that by creating threads on all numa nodes rxqs assigned to.

Fixes: 3eb67853c481 ("dpif-netdev: Introduce pmd-rxq-affinity.")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-11-15 14:36:39 +03:00
+								        if (netdev_is_pmd(port->netdev)) {
 								            int i;
 								            if (netdev_get_numa_id(port->netdev) == numa_id) {
 								                return true;
 								            }
 								            for (i = 0; i < port->n_rxq; i++) {
 								                unsigned core_id = port->rxqs[i].core_id;
-												dpif-netdev: Uses the OVS_CORE_UNSPEC instead of magic numbers.

This patch uses OVS_CORE_UNSPEC for the queue unpinned instead
of "-1". More important, the "-1" casted to unsigned int is
equal to NON_PMD_CORE_ID. We make the distinction between them.

Signed-off-by: nickcooper-zhangtonghao <nic@opencloud.tech>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2017-01-08 17:30:22 -08:00
+								                if (core_id != OVS_CORE_UNSPEC
-												dpif-netdev: Honor rxq affinity during pmd threads creation.

Currently, If user will set up 'pmd-rxq-affinity' to cores on
different numa node, they may not be polled, because pmd threads
will not be created there even if this cores are in 'pmd-cpu-mask'.
Fix that by creating threads on all numa nodes rxqs assigned to.

Fixes: 3eb67853c481 ("dpif-netdev: Introduce pmd-rxq-affinity.")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-11-15 14:36:39 +03:00
+								                    && ovs_numa_get_numa_id(core_id) == numa_id) {
 								                    return true;
 								                }
 								            }
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								        }
 								    }
 								    return false;
 								}
-												bridge: Add test that ports that disappear get added back to the datapath.

The test added in this commit would have caught the bug fixed by commit
96be8de595150 (bridge: When ports disappear from a datapath, add them
back.).  With that commit reverted, the new test fails.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Gurucharan Shetty <gshetty@nicira.com>

											
										
										
											2014-05-22 09:36:00 -07:00
+								static void
 								do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    OVS_REQUIRES(dp->port_mutex)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    hmap_remove(&dp->ports, &port->node);
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    seq_change(dp->port_seq);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
 								    dp_netdev_del_port_from_all_pmds(dp, port);
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    if (netdev_is_pmd(port->netdev)) {
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								        int numa_id = netdev_get_numa_id(port->netdev);
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								        /* PMD threads can not be on invalid numa node. */
 								        ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								        /* If there is no netdev on the numa node, deletes the pmd threads
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								         * for that numa. */
-												dpif-netdev: Honor rxq affinity during pmd threads creation.

Currently, If user will set up 'pmd-rxq-affinity' to cores on
different numa node, they may not be polled, because pmd threads
will not be created there even if this cores are in 'pmd-cpu-mask'.
Fix that by creating threads on all numa nodes rxqs assigned to.

Fixes: 3eb67853c481 ("dpif-netdev: Introduce pmd-rxq-affinity.")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-11-15 14:36:39 +03:00
+								        if (!has_pmd_rxq_for_numa(dp, numa_id)) {
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								            dp_netdev_del_pmds_on_numa(dp, numa_id);
 								        }
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    }
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Do not keep refcount for ports.

Only the main thread will delete ports after pausing every other
thread.  There's no need to keep count.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 15:25:03 -08:00
+								    port_destroy(port);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static void
-												dpif: Eliminate "struct odp_port" from client-visible interface.

Following this commit, "struct odp_port" is only used in Linux-specific
parts of OVS userspace code.  This allows the actual Linux datapath
interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-23 18:48:02 -08:00
+								answer_port_query(const struct dp_netdev_port *port,
 								                  struct dpif_port *dpif_port)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												dpif-netdev: Don't run port names through netdev_vport_get_dpif_port().

The ports that exist within a dpif have already been translated through
netdev_vport_get_dpif_port(), so there is no value to translating them
again in the interfaces that query or dump ports (and possibly a drawback
if somehow the translation could change).

After this change, dpif-netdev translates port names in just one place,
the port_add path, which makes dpif-netdev act the same way as dpif-linux
in this respect.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-06-06 15:27:15 -07:00
+								    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
-												dummy: Make --enable-dummy=override replace all dpifs, netdevs by dummies.

Plain "--enable-dummy" just creates new dummy dpif and netdev classes.
This commit makes "--enable-dummy=override" go a step farther and actually
delete and replace all the existing dpif and netdev classes by copies of
the dummy class.

This is useful for testing in an environment where changing the classes in
Bridge or Interface records is challenging.

Requested-by: Andrew Lambeth <wal@nicira.com>
Tested-by: Andrew Lambeth <wal@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-19 10:24:46 -08:00
+								    dpif_port->type = xstrdup(port->type);
-												tunnels: Don't initialize unnecessary packet metadata.

The addition of Geneve options to packet metadata significantly
expanded its size. It was reported that this can decrease performance
for DPDK ports by up to 25% since we need to initialize the whole
structure on each packet receive.

It is not really necessary to zero out the entire structure because
miniflow_extract() only copies the tunnel metadata when particular
fields indicate that it is valid. Therefore, as long as we zero out
these fields when the metadata is initialized and ensure that the
rest of the structure is correctly set in the presence of a tunnel,
we can avoid touching the tunnel fields on packet reception.

Reported-by: Ciara Loftus <ciara.loftus@intel.com>
Tested-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-06-30 19:19:40 -07:00
+								    dpif_port->port_no = port->port_no;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static int
-												Create specific types for ofp and odp port

Until now, datapath ports and openflow ports were both represented by
unsigned integers of various sizes. With implicit conversions, etc., it is
easy to mix them up and use one where the other is expected.  This commit
creates two typedefs, ofp_port_t and odp_port_t.  Both of these two types
are marked by "__attribute__((bitwise))" so that sparse can be used to
detect any misuse.

Signed-off-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-19 16:58:44 -07:00
+								dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
-												dpif: Eliminate "struct odp_port" from client-visible interface.

Following this commit, "struct odp_port" is only used in Linux-specific
parts of OVS userspace code.  This allows the actual Linux datapath
interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-23 18:48:02 -08:00
+								                                 struct dpif_port *dpif_port)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
 								    struct dp_netdev_port *port;
 								    int error;
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    error = get_port_by_number(dp, port_no, &port);
-												dpif: Add new dpif_port_exists() function.

Provide the ability to determine whether a port exists in a datapath
without having to deal with a "dpif_port" structure as with
dpif_port_query_by_name().  A future patch will use this function.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-10-17 23:11:53 -07:00
+								    if (!error && dpif_port) {
-												dpif: Eliminate "struct odp_port" from client-visible interface.

Following this commit, "struct odp_port" is only used in Linux-specific
parts of OVS userspace code.  This allows the actual Linux datapath
interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-23 18:48:02 -08:00
+								        answer_port_query(port, dpif_port);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    ovs_mutex_unlock(&dp->port_mutex);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    return error;
 								}
 								static int
 								dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
-												dpif: Eliminate "struct odp_port" from client-visible interface.

Following this commit, "struct odp_port" is only used in Linux-specific
parts of OVS userspace code.  This allows the actual Linux datapath
interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-23 18:48:02 -08:00
+								                               struct dpif_port *dpif_port)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
 								    struct dp_netdev_port *port;
 								    int error;
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    error = get_port_by_name(dp, devname, &port);
-												dpif: Add new dpif_port_exists() function.

Provide the ability to determine whether a port exists in a datapath
without having to deal with a "dpif_port" structure as with
dpif_port_query_by_name().  A future patch will use this function.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-10-17 23:11:53 -07:00
+								    if (!error && dpif_port) {
-												dpif: Eliminate "struct odp_port" from client-visible interface.

Following this commit, "struct odp_port" is only used in Linux-specific
parts of OVS userspace code.  This allows the actual Linux datapath
interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-23 18:48:02 -08:00
+								        answer_port_query(port, dpif_port);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_unlock(&dp->port_mutex);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    return error;
 								}
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								static void
 								dp_netdev_flow_free(struct dp_netdev_flow *flow)
 								{
 								    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
 								    free(flow);
 								}
-												dpif-netdev: Reintroduce ref_cnt for dp_netdev_flow

struct dp_netdev_flow used to have a reference counter.
It has been replaced by RCU. Unfortunately RCU is not
enough if we plan to hold a reference to the dp_netdev_flow
for a long time. So this commit reintroduces reference
counting for struct dp_netdev_flow

Subsequent commits make use of it.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-11 17:25:50 -07:00
+								static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
 								{
 								    if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
 								        ovsrcu_postpone(dp_netdev_flow_free, flow);
 								    }
 								}
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								static uint32_t
 								dp_netdev_flow_hash(const ovs_u128 *ufid)
 								{
 								    return ufid->u32[0];
 								}
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								static inline struct dpcls *
 								dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
 								                           odp_port_t in_port)
 								{
 								    struct dpcls *cls;
 								    uint32_t hash = hash_port_no(in_port);
 								    CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
 								        if (cls->in_port == in_port) {
 								            /* Port classifier exists already */
 								            return cls;
 								        }
 								    }
 								    return NULL;
 								}
 								static inline struct dpcls *
 								dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
 								                         odp_port_t in_port)
 								    OVS_REQUIRES(pmd->flow_mutex)
 								{
 								    struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
 								    uint32_t hash = hash_port_no(in_port);
 								    if (!cls) {
 								        /* Create new classifier for in_port */
 								        cls = xmalloc(sizeof(*cls));
 								        dpcls_init(cls);
 								        cls->in_port = in_port;
 								        cmap_insert(&pmd->classifiers, &cls->node, hash);
 								        VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
 								    }
 								    return cls;
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static void
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
 								                          struct dp_netdev_flow *flow)
 								    OVS_REQUIRES(pmd->flow_mutex)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												dpif-netdev: Use cmap instead of hmap.

This requires less locking and makes introducing lockless classifier
lookups possible.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2014-07-04 06:38:47 -07:00
+								    struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    struct dpcls *cls;
 								    odp_port_t in_port = flow->flow.in_port.odp_port;
-												dpif-netdev: Introduce a classifier in userspace datapath.

Instead of an exact match flow table, we introduce a classifier.
This enables mega-flows in userspace datapath.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
[blp@nicira.com tweaked flow lookup]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-04 06:23:54 -08:00
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
 								    ovs_assert(cls != NULL);
 								    dpcls_remove(cls, &flow->cr);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    flow->dead = true;
-												dpif-netdev: Reintroduce ref_cnt for dp_netdev_flow

struct dp_netdev_flow used to have a reference counter.
It has been replaced by RCU. Unfortunately RCU is not
enough if we plan to hold a reference to the dp_netdev_flow
for a long time. So this commit reintroduces reference
counting for struct dp_netdev_flow

Subsequent commits make use of it.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-11 17:25:50 -07:00
 								    dp_netdev_flow_unref(flow);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static void
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												cmap, classifier: Avoid unsafe aliasing in iterators.

CMAP_FOR_EACH and CLS_FOR_EACH and their variants tried to use void ** as
a "pointer to any kind of pointer".  That is a violation of the aliasing
rules in ISO C which technically yields undefined behavior.  With GCC 4.1,
it causes both warnings and actual misbehavior.  One option would to add
-fno-strict-aliasing to the compiler flags, but that would only help with
GCC; who knows whether this can be worked around with other compilers.

Instead, this commit rewrites the iterators to avoid disallowed pointer
aliasing.

VMware-BZ: #1287651
Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-07-21 21:00:04 -07:00
+								    struct dp_netdev_flow *netdev_flow;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    ovs_mutex_lock(&pmd->flow_mutex);
 								    CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
 								        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    ovs_mutex_unlock(&pmd->flow_mutex);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static int
 								dpif_netdev_flow_flush(struct dpif *dpif)
 								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    struct dp_netdev_pmd_thread *pmd;
 								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
 								        dp_netdev_pmd_flow_flush(pmd);
 								    }
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    return 0;
 								}
-												datapath: Change listing ports to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to add new
features to the kernel vport layer without changing userspace software.  In
turn, that means that the odp_port structure must become variable-length.
This does not, however, fit in well with the ODP_PORT_LIST ioctl in its
current form, because that would require userspace to know how much space
to allocate for each port in advance, or to allocate as much space as
could possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_PORT_LIST
by a new ioctl ODP_VPORT_DUMP that retrieves information about a single
vport from the datapath on each call.  It is much cleaner to allocate the
maximum amount of space for a single vport than to do so for possibly a
large number of vports.

It would be faster to retrieve a number of vports in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

The Netlink version won't need to take the starting port number from
userspace, since Netlink sockets can keep track of that state as part
of their "dump" feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-10 13:12:12 -08:00
+								struct dp_netdev_port_state {
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    struct hmap_position position;
-												dpif: Eliminate "struct odp_port" from client-visible interface.

Following this commit, "struct odp_port" is only used in Linux-specific
parts of OVS userspace code.  This allows the actual Linux datapath
interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-23 18:48:02 -08:00
+								    char *name;
-												datapath: Change listing ports to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to add new
features to the kernel vport layer without changing userspace software.  In
turn, that means that the odp_port structure must become variable-length.
This does not, however, fit in well with the ODP_PORT_LIST ioctl in its
current form, because that would require userspace to know how much space
to allocate for each port in advance, or to allocate as much space as
could possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_PORT_LIST
by a new ioctl ODP_VPORT_DUMP that retrieves information about a single
vport from the datapath on each call.  It is much cleaner to allocate the
maximum amount of space for a single vport than to do so for possibly a
large number of vports.

It would be faster to retrieve a number of vports in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

The Netlink version won't need to take the starting port number from
userspace, since Netlink sockets can keep track of that state as part
of their "dump" feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-10 13:12:12 -08:00
+								};
 								static int
 								dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
 								{
 								    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
 								    return 0;
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static int
-												datapath: Change listing ports to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to add new
features to the kernel vport layer without changing userspace software.  In
turn, that means that the odp_port structure must become variable-length.
This does not, however, fit in well with the ODP_PORT_LIST ioctl in its
current form, because that would require userspace to know how much space
to allocate for each port in advance, or to allocate as much space as
could possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_PORT_LIST
by a new ioctl ODP_VPORT_DUMP that retrieves information about a single
vport from the datapath on each call.  It is much cleaner to allocate the
maximum amount of space for a single vport than to do so for possibly a
large number of vports.

It would be faster to retrieve a number of vports in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

The Netlink version won't need to take the starting port number from
userspace, since Netlink sockets can keep track of that state as part
of their "dump" feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-10 13:12:12 -08:00
+								dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
-												dpif: Eliminate "struct odp_port" from client-visible interface.

Following this commit, "struct odp_port" is only used in Linux-specific
parts of OVS userspace code.  This allows the actual Linux datapath
interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-23 18:48:02 -08:00
+								                           struct dpif_port *dpif_port)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												datapath: Change listing ports to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to add new
features to the kernel vport layer without changing userspace software.  In
turn, that means that the odp_port structure must become variable-length.
This does not, however, fit in well with the ODP_PORT_LIST ioctl in its
current form, because that would require userspace to know how much space
to allocate for each port in advance, or to allocate as much space as
could possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_PORT_LIST
by a new ioctl ODP_VPORT_DUMP that retrieves information about a single
vport from the datapath on each call.  It is much cleaner to allocate the
maximum amount of space for a single vport than to do so for possibly a
large number of vports.

It would be faster to retrieve a number of vports in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

The Netlink version won't need to take the starting port number from
userspace, since Netlink sockets can keep track of that state as part
of their "dump" feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-10 13:12:12 -08:00
+								    struct dp_netdev_port_state *state = state_;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    struct hmap_node *node;
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								    int retval;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
 								    node = hmap_at_position(&dp->ports, &state->position);
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								    if (node) {
 								        struct dp_netdev_port *port;
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								        port = CONTAINER_OF(node, struct dp_netdev_port, node);
 								        free(state->name);
 								        state->name = xstrdup(netdev_get_name(port->netdev));
 								        dpif_port->name = state->name;
 								        dpif_port->type = port->type;
-												tunnels: Don't initialize unnecessary packet metadata.

The addition of Geneve options to packet metadata significantly
expanded its size. It was reported that this can decrease performance
for DPDK ports by up to 25% since we need to initialize the whole
structure on each packet receive.

It is not really necessary to zero out the entire structure because
miniflow_extract() only copies the tunnel metadata when particular
fields indicate that it is valid. Therefore, as long as we zero out
these fields when the metadata is initialized and ensure that the
rest of the structure is correctly set in the presence of a tunnel,
we can avoid touching the tunnel fields on packet reception.

Reported-by: Ciara Loftus <ciara.loftus@intel.com>
Tested-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-06-30 19:19:40 -07:00
+								        dpif_port->port_no = port->port_no;
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
 								        retval = 0;
 								    } else {
 								        retval = EOF;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    ovs_mutex_unlock(&dp->port_mutex);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								    return retval;
-												datapath: Change listing ports to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to add new
features to the kernel vport layer without changing userspace software.  In
turn, that means that the odp_port structure must become variable-length.
This does not, however, fit in well with the ODP_PORT_LIST ioctl in its
current form, because that would require userspace to know how much space
to allocate for each port in advance, or to allocate as much space as
could possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_PORT_LIST
by a new ioctl ODP_VPORT_DUMP that retrieves information about a single
vport from the datapath on each call.  It is much cleaner to allocate the
maximum amount of space for a single vport than to do so for possibly a
large number of vports.

It would be faster to retrieve a number of vports in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

The Netlink version won't need to take the starting port number from
userspace, since Netlink sockets can keep track of that state as part
of their "dump" feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-10 13:12:12 -08:00
+								}
 								static int
-												dpif: Eliminate "struct odp_port" from client-visible interface.

Following this commit, "struct odp_port" is only used in Linux-specific
parts of OVS userspace code.  This allows the actual Linux datapath
interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-23 18:48:02 -08:00
+								dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
-												datapath: Change listing ports to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to add new
features to the kernel vport layer without changing userspace software.  In
turn, that means that the odp_port structure must become variable-length.
This does not, however, fit in well with the ODP_PORT_LIST ioctl in its
current form, because that would require userspace to know how much space
to allocate for each port in advance, or to allocate as much space as
could possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_PORT_LIST
by a new ioctl ODP_VPORT_DUMP that retrieves information about a single
vport from the datapath on each call.  It is much cleaner to allocate the
maximum amount of space for a single vport than to do so for possibly a
large number of vports.

It would be faster to retrieve a number of vports in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

The Netlink version won't need to take the starting port number from
userspace, since Netlink sockets can keep track of that state as part
of their "dump" feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-10 13:12:12 -08:00
+								{
-												dpif: Eliminate "struct odp_port" from client-visible interface.

Following this commit, "struct odp_port" is only used in Linux-specific
parts of OVS userspace code.  This allows the actual Linux datapath
interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-23 18:48:02 -08:00
+								    struct dp_netdev_port_state *state = state_;
 								    free(state->name);
-												datapath: Change listing ports to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to add new
features to the kernel vport layer without changing userspace software.  In
turn, that means that the odp_port structure must become variable-length.
This does not, however, fit in well with the ODP_PORT_LIST ioctl in its
current form, because that would require userspace to know how much space
to allocate for each port in advance, or to allocate as much space as
could possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_PORT_LIST
by a new ioctl ODP_VPORT_DUMP that retrieves information about a single
vport from the datapath on each call.  It is much cleaner to allocate the
maximum amount of space for a single vport than to do so for possibly a
large number of vports.

It would be faster to retrieve a number of vports in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

The Netlink version won't need to take the starting port number from
userspace, since Netlink sockets can keep track of that state as part
of their "dump" feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-10 13:12:12 -08:00
+								    free(state);
 								    return 0;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static int
-												Rename UNUSED macro to OVS_UNUSED to avoid naming conflict.

Requested by Jean Tourrilhes <jt@hpl.hp.com>.

											
										
										
											2010-02-11 10:59:47 -08:00
+								dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    uint64_t new_port_seq;
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								    int error;
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    new_port_seq = seq_read(dpif->dp->port_seq);
 								    if (dpif->last_port_seq != new_port_seq) {
 								        dpif->last_port_seq = new_port_seq;
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								        error = ENOBUFS;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    } else {
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								        error = EAGAIN;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
 								    return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static void
 								dpif_netdev_port_poll_wait(const struct dpif *dpif_)
 								{
 								    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								}
 								static struct dp_netdev_flow *
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								dp_netdev_flow_cast(const struct dpcls_rule *cr)
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								{
 								    return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
 								{
 								    return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
 								}
-												dpif-netdev: Introduce netdev_flow_key_* functions

netdev_flow_key is a miniflow with the following constraints:

1) It is used only inside dpif-netdev.c.
2) It always has inline values.
3) It contains only miniflows created by miniflow_extract().

Therefore, by using these new functions instead of the miniflow_*
ones, we get the following (performance related) benefits:

- Because of (1) the functions can be inlined.
- Because of (2) and (3) the netdev_flow_key can be treated as POD.
  Specifically, because of (3), we can do comparisons with memcmp,
  since if the map is different the miniflow must be different.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
											
										
										
											2014-09-06 08:10:42 +00:00
+								/* netdev_flow_key utilities.
 								 *
 								 * netdev_flow_key is basically a miniflow.  We use these functions
 								 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
 								 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
 								 *
 								 * - Since we are dealing exclusively with miniflows created by
 								 *   miniflow_extract(), if the map is different the miniflow is different.
 								 *   Therefore we can be faster by comparing the map and the miniflow in a
 								 *   single memcmp().
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								 * - These functions can be inlined by the compiler. */
-												dpif-netdev: Introduce netdev_flow_key_* functions

netdev_flow_key is a miniflow with the following constraints:

1) It is used only inside dpif-netdev.c.
2) It always has inline values.
3) It contains only miniflows created by miniflow_extract().

Therefore, by using these new functions instead of the miniflow_*
ones, we get the following (performance related) benefits:

- Because of (1) the functions can be inlined.
- Because of (2) and (3) the netdev_flow_key can be treated as POD.
  Specifically, because of (3), we can do comparisons with memcmp,
  since if the map is different the miniflow must be different.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
											
										
										
											2014-09-06 08:10:42 +00:00
-												flow: Split miniflow's map.

Use two maps in miniflow to allow for expansion of struct flow past
512 bytes.  We now have one map for tunnel related fields, and another
for the rest of the packet metadata and actual packet header fields.
This split has the benefit that for non-tunneled packets the overhead
should be minimal.

Some miniflow utilities now exist in two variants, new ones operating
over all the data, and the old ones operating only on a single 64-bit
map at a time.  The old ones require doubling of code but should
execute faster, so those are used in the datapath and classifier's
lookup path.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-17 15:18:43 -07:00
+								/* Given the number of bits set in miniflow's maps, returns the size of the
-												lib/dpif-netdev: Fix EMC lookup.

Patch 0de8783a9 (lib/dpif-netdev: Integrate megaflow classifier.)
broke exact match cache lookup, but it went undetected since there are
no separate stats for EMC.

This patch fixes the problem by changing the struct netdev_flow_key
'len' member to cover only the 'mf' member, not the whole
netdev_flow_key, and ignoring the 'len' field in
netdev_flow_key_equal.  Comparison is still accurate, as the miniflow
'map' field encodes the length in the number of 1-bits, and the map is
included in the comparison.

Reported-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>

											
										
										
											2014-10-17 17:03:13 -07:00
+								 * 'netdev_flow_key.mf' */
-												flow: Split miniflow's map.

Use two maps in miniflow to allow for expansion of struct flow past
512 bytes.  We now have one map for tunnel related fields, and another
for the rest of the packet metadata and actual packet header fields.
This split has the benefit that for non-tunneled packets the overhead
should be minimal.

Some miniflow utilities now exist in two variants, new ones operating
over all the data, and the old ones operating only on a single 64-bit
map at a time.  The old ones require doubling of code but should
execute faster, so those are used in the datapath and classifier's
lookup path.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-17 15:18:43 -07:00
+								static inline size_t
 								netdev_flow_key_size(size_t flow_u64s)
-												dpif-netdev: Introduce netdev_flow_key_* functions

netdev_flow_key is a miniflow with the following constraints:

1) It is used only inside dpif-netdev.c.
2) It always has inline values.
3) It contains only miniflows created by miniflow_extract().

Therefore, by using these new functions instead of the miniflow_*
ones, we get the following (performance related) benefits:

- Because of (1) the functions can be inlined.
- Because of (2) and (3) the netdev_flow_key can be treated as POD.
  Specifically, because of (3), we can do comparisons with memcmp,
  since if the map is different the miniflow must be different.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
											
										
										
											2014-09-06 08:10:42 +00:00
+								{
-												flow: Split miniflow's map.

Use two maps in miniflow to allow for expansion of struct flow past
512 bytes.  We now have one map for tunnel related fields, and another
for the rest of the packet metadata and actual packet header fields.
This split has the benefit that for non-tunneled packets the overhead
should be minimal.

Some miniflow utilities now exist in two variants, new ones operating
over all the data, and the old ones operating only on a single 64-bit
map at a time.  The old ones require doubling of code but should
execute faster, so those are used in the datapath and classifier's
lookup path.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-17 15:18:43 -07:00
+								    return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
-												dpif-netdev: Introduce netdev_flow_key_* functions

netdev_flow_key is a miniflow with the following constraints:

1) It is used only inside dpif-netdev.c.
2) It always has inline values.
3) It contains only miniflows created by miniflow_extract().

Therefore, by using these new functions instead of the miniflow_*
ones, we get the following (performance related) benefits:

- Because of (1) the functions can be inlined.
- Because of (2) and (3) the netdev_flow_key can be treated as POD.
  Specifically, because of (3), we can do comparisons with memcmp,
  since if the map is different the miniflow must be different.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
											
										
										
											2014-09-06 08:10:42 +00:00
+								}
 								static inline bool
 								netdev_flow_key_equal(const struct netdev_flow_key *a,
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								                      const struct netdev_flow_key *b)
 								{
-												lib/dpif-netdev: Fix EMC lookup.

Patch 0de8783a9 (lib/dpif-netdev: Integrate megaflow classifier.)
broke exact match cache lookup, but it went undetected since there are
no separate stats for EMC.

This patch fixes the problem by changing the struct netdev_flow_key
'len' member to cover only the 'mf' member, not the whole
netdev_flow_key, and ignoring the 'len' field in
netdev_flow_key_equal.  Comparison is still accurate, as the miniflow
'map' field encodes the length in the number of 1-bits, and the map is
included in the comparison.

Reported-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>

											
										
										
											2014-10-17 17:03:13 -07:00
+								    /* 'b->len' may be not set yet. */
 								    return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								}
 								/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
-												dpif-netdev: Fix typo in comment.

Signed-off-by: Ben Pfaff <blp@ovn.org>
Acked-by: Justin Pettit <jpettit@ovn.org>

											
										
										
											2016-03-03 20:43:20 -08:00
+								 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								 * generated by miniflow_extract. */
 								static inline bool
 								netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
 								                         const struct miniflow *mf)
-												dpif-netdev: Introduce netdev_flow_key_* functions

netdev_flow_key is a miniflow with the following constraints:

1) It is used only inside dpif-netdev.c.
2) It always has inline values.
3) It contains only miniflows created by miniflow_extract().

Therefore, by using these new functions instead of the miniflow_*
ones, we get the following (performance related) benefits:

- Because of (1) the functions can be inlined.
- Because of (2) and (3) the netdev_flow_key can be treated as POD.
  Specifically, because of (3), we can do comparisons with memcmp,
  since if the map is different the miniflow must be different.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
											
										
										
											2014-09-06 08:10:42 +00:00
+								{
-												lib/dpif-netdev: Fix EMC lookup.

Patch 0de8783a9 (lib/dpif-netdev: Integrate megaflow classifier.)
broke exact match cache lookup, but it went undetected since there are
no separate stats for EMC.

This patch fixes the problem by changing the struct netdev_flow_key
'len' member to cover only the 'mf' member, not the whole
netdev_flow_key, and ignoring the 'len' field in
netdev_flow_key_equal.  Comparison is still accurate, as the miniflow
'map' field encodes the length in the number of 1-bits, and the map is
included in the comparison.

Reported-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>

											
										
										
											2014-10-17 17:03:13 -07:00
+								    return !memcmp(&key->mf, mf, key->len);
-												dpif-netdev: Introduce netdev_flow_key_* functions

netdev_flow_key is a miniflow with the following constraints:

1) It is used only inside dpif-netdev.c.
2) It always has inline values.
3) It contains only miniflows created by miniflow_extract().

Therefore, by using these new functions instead of the miniflow_*
ones, we get the following (performance related) benefits:

- Because of (1) the functions can be inlined.
- Because of (2) and (3) the netdev_flow_key can be treated as POD.
  Specifically, because of (3), we can do comparisons with memcmp,
  since if the map is different the miniflow must be different.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
											
										
										
											2014-09-06 08:10:42 +00:00
+								}
 								static inline void
 								netdev_flow_key_clone(struct netdev_flow_key *dst,
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								                      const struct netdev_flow_key *src)
 								{
-												lib/dpif-netdev: Fix EMC lookup.

Patch 0de8783a9 (lib/dpif-netdev: Integrate megaflow classifier.)
broke exact match cache lookup, but it went undetected since there are
no separate stats for EMC.

This patch fixes the problem by changing the struct netdev_flow_key
'len' member to cover only the 'mf' member, not the whole
netdev_flow_key, and ignoring the 'len' field in
netdev_flow_key_equal.  Comparison is still accurate, as the miniflow
'map' field encodes the length in the number of 1-bits, and the map is
included in the comparison.

Reported-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>

											
										
										
											2014-10-17 17:03:13 -07:00
+								    memcpy(dst, src,
 								           offsetof(struct netdev_flow_key, mf) + src->len);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								}
 								/* Slow. */
 								static void
 								netdev_flow_key_from_flow(struct netdev_flow_key *dst,
 								                          const struct flow *src)
 								{
-												dp-packet: Remove ofpbuf dependency.

Currently dp-packet make use of ofpbuf for managing packet
buffers. That complicates ofpbuf, by making dp-packet
independent of ofpbuf both libraries can be optimized for
their own use case.
This avoids mapping operation between ofpbuf and dp_packet
in datapath upcalls.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-02-22 03:21:09 -08:00
+								    struct dp_packet packet;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    uint64_t buf_stub[512 / 8];
-												dp-packet: Remove ofpbuf dependency.

Currently dp-packet make use of ofpbuf for managing packet
buffers. That complicates ofpbuf, by making dp-packet
independent of ofpbuf both libraries can be optimized for
their own use case.
This avoids mapping operation between ofpbuf and dp_packet
in datapath upcalls.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-02-22 03:21:09 -08:00
+								    dp_packet_use_stub(&packet, buf_stub, sizeof buf_stub);
 								    pkt_metadata_from_flow(&packet.md, src);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    flow_compose(&packet, src);
-												dp-packet: Remove ofpbuf dependency.

Currently dp-packet make use of ofpbuf for managing packet
buffers. That complicates ofpbuf, by making dp-packet
independent of ofpbuf both libraries can be optimized for
their own use case.
This avoids mapping operation between ofpbuf and dp_packet
in datapath upcalls.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-02-22 03:21:09 -08:00
+								    miniflow_extract(&packet, &dst->mf);
 								    dp_packet_uninit(&packet);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												flow: Split miniflow's map.

Use two maps in miniflow to allow for expansion of struct flow past
512 bytes.  We now have one map for tunnel related fields, and another
for the rest of the packet metadata and actual packet header fields.
This split has the benefit that for non-tunneled packets the overhead
should be minimal.

Some miniflow utilities now exist in two variants, new ones operating
over all the data, and the old ones operating only on a single 64-bit
map at a time.  The old ones require doubling of code but should
execute faster, so those are used in the datapath and classifier's
lookup path.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-17 15:18:43 -07:00
+								    dst->len = netdev_flow_key_size(miniflow_n_values(&dst->mf));
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    dst->hash = 0; /* Not computed yet. */
 								}
 								/* Initialize a netdev_flow_key 'mask' from 'match'. */
 								static inline void
 								netdev_flow_mask_init(struct netdev_flow_key *mask,
 								                      const struct match *match)
 								{
-												flow: Make compile with MSVC.

MSVC does not like zero sized arrays in structs.  Hence, remove the
'values' member from struct miniflow and add back the getters
miniflow_values() and miniflow_get_values().

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-16 17:42:24 -07:00
+								    uint64_t *dst = miniflow_values(&mask->mf);
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    struct flowmap fmap;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    uint32_t hash = 0;
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    size_t idx;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
 								    /* Only check masks that make sense for the flow. */
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    flow_wc_map(&match->flow, &fmap);
 								    flowmap_init(&mask->mf.map);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
 								        uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								        if (mask_u64) {
 								            flowmap_set(&mask->mf.map, idx, 1);
 								            *dst++ = mask_u64;
 								            hash = hash_add64(hash, mask_u64);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								        }
 								    }
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    map_t map;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
 								        hash = hash_add64(hash, map);
 								    }
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    size_t n = dst - miniflow_get_values(&mask->mf);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												miniflow: Use 64-bit data.

So far the compressed flow data in struct miniflow has been in 32-bit
words with a 63-bit map, allowing for a maximum size of struct flow of
252 bytes.  With the forthcoming Geneve options this is not sufficient
any more.

This patch solves the problem by changing the miniflow data to 64-bit
words, doubling the flow max size to 504 bytes.  Since the word size
is doubled, there is some loss in compression efficiency.  To counter
this some of the flow fields have been reordered to keep related
fields together (e.g., the source and destination IP addresses share
the same 64-bit word).

This change should speed up flow data processing on 64-bit CPUs, which
may help counterbalance the impact of making the struct flow bigger in
the future.

Classifier lookup stage boundaries are also changed to 64-bit
alignment, as the current algorithm depends on each miniflow word to
not be split between ranges.  This has resulted in new padding (part
of the 'mpls_lse' field).

The 'dp_hash' field is also moved to packet metadata to eliminate
otherwise needed padding there.  This allows the L4 to fit into one
64-bit word, and also makes matches on 'dp_hash' more efficient as
misses can be found already on stage 1.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-01-06 11:10:42 -08:00
+								    mask->hash = hash_finish(hash, n * 8);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    mask->len = netdev_flow_key_size(n);
 								}
-												flow: Split miniflow's map.

Use two maps in miniflow to allow for expansion of struct flow past
512 bytes.  We now have one map for tunnel related fields, and another
for the rest of the packet metadata and actual packet header fields.
This split has the benefit that for non-tunneled packets the overhead
should be minimal.

Some miniflow utilities now exist in two variants, new ones operating
over all the data, and the old ones operating only on a single 64-bit
map at a time.  The old ones require doubling of code but should
execute faster, so those are used in the datapath and classifier's
lookup path.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-17 15:18:43 -07:00
+								/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								static inline void
 								netdev_flow_key_init_masked(struct netdev_flow_key *dst,
 								                            const struct flow *flow,
 								                            const struct netdev_flow_key *mask)
-												dpif-netdev: Introduce netdev_flow_key_* functions

netdev_flow_key is a miniflow with the following constraints:

1) It is used only inside dpif-netdev.c.
2) It always has inline values.
3) It contains only miniflows created by miniflow_extract().

Therefore, by using these new functions instead of the miniflow_*
ones, we get the following (performance related) benefits:

- Because of (1) the functions can be inlined.
- Because of (2) and (3) the netdev_flow_key can be treated as POD.
  Specifically, because of (3), we can do comparisons with memcmp,
  since if the map is different the miniflow must be different.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
											
										
										
											2014-09-06 08:10:42 +00:00
+								{
-												flow: Make compile with MSVC.

MSVC does not like zero sized arrays in structs.  Hence, remove the
'values' member from struct miniflow and add back the getters
miniflow_values() and miniflow_get_values().

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-16 17:42:24 -07:00
+								    uint64_t *dst_u64 = miniflow_values(&dst->mf);
 								    const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    uint32_t hash = 0;
-												miniflow: Use 64-bit data.

So far the compressed flow data in struct miniflow has been in 32-bit
words with a 63-bit map, allowing for a maximum size of struct flow of
252 bytes.  With the forthcoming Geneve options this is not sufficient
any more.

This patch solves the problem by changing the miniflow data to 64-bit
words, doubling the flow max size to 504 bytes.  Since the word size
is doubled, there is some loss in compression efficiency.  To counter
this some of the flow fields have been reordered to keep related
fields together (e.g., the source and destination IP addresses share
the same 64-bit word).

This change should speed up flow data processing on 64-bit CPUs, which
may help counterbalance the impact of making the struct flow bigger in
the future.

Classifier lookup stage boundaries are also changed to 64-bit
alignment, as the current algorithm depends on each miniflow word to
not be split between ranges.  This has resulted in new padding (part
of the 'mpls_lse' field).

The 'dp_hash' field is also moved to packet metadata to eliminate
otherwise needed padding there.  This allows the L4 to fit into one
64-bit word, and also makes matches on 'dp_hash' more efficient as
misses can be found already on stage 1.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-01-06 11:10:42 -08:00
+								    uint64_t value;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
 								    dst->len = mask->len;
-												flow: Split miniflow's map.

Use two maps in miniflow to allow for expansion of struct flow past
512 bytes.  We now have one map for tunnel related fields, and another
for the rest of the packet metadata and actual packet header fields.
This split has the benefit that for non-tunneled packets the overhead
should be minimal.

Some miniflow utilities now exist in two variants, new ones operating
over all the data, and the old ones operating only on a single 64-bit
map at a time.  The old ones require doubling of code but should
execute faster, so those are used in the datapath and classifier's
lookup path.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-17 15:18:43 -07:00
+								    dst->mf = mask->mf;   /* Copy maps. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
-												miniflow: Use 64-bit data.

So far the compressed flow data in struct miniflow has been in 32-bit
words with a 63-bit map, allowing for a maximum size of struct flow of
252 bytes.  With the forthcoming Geneve options this is not sufficient
any more.

This patch solves the problem by changing the miniflow data to 64-bit
words, doubling the flow max size to 504 bytes.  Since the word size
is doubled, there is some loss in compression efficiency.  To counter
this some of the flow fields have been reordered to keep related
fields together (e.g., the source and destination IP addresses share
the same 64-bit word).

This change should speed up flow data processing on 64-bit CPUs, which
may help counterbalance the impact of making the struct flow bigger in
the future.

Classifier lookup stage boundaries are also changed to 64-bit
alignment, as the current algorithm depends on each miniflow word to
not be split between ranges.  This has resulted in new padding (part
of the 'mpls_lse' field).

The 'dp_hash' field is also moved to packet metadata to eliminate
otherwise needed padding there.  This allows the L4 to fit into one
64-bit word, and also makes matches on 'dp_hash' more efficient as
misses can be found already on stage 1.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-01-06 11:10:42 -08:00
+								        *dst_u64 = value & *mask_u64++;
 								        hash = hash_add64(hash, *dst_u64++);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    }
-												flow: Make compile with MSVC.

MSVC does not like zero sized arrays in structs.  Hence, remove the
'values' member from struct miniflow and add back the getters
miniflow_values() and miniflow_get_values().

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-16 17:42:24 -07:00
+								    dst->hash = hash_finish(hash,
 								                            (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								}
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								/* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
 								#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP)   \
 								    MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
 								/* Returns a hash value for the bits of 'key' where there are 1-bits in
 								 * 'mask'. */
 								static inline uint32_t
 								netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
 								                             const struct netdev_flow_key *mask)
 								{
-												flow: Make compile with MSVC.

MSVC does not like zero sized arrays in structs.  Hence, remove the
'values' member from struct miniflow and add back the getters
miniflow_values() and miniflow_get_values().

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-16 17:42:24 -07:00
+								    const uint64_t *p = miniflow_get_values(&mask->mf);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    uint32_t hash = 0;
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    uint64_t value;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
 								        hash = hash_add64(hash, value & *p++);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    }
-												flow: Make compile with MSVC.

MSVC does not like zero sized arrays in structs.  Hence, remove the
'values' member from struct miniflow and add back the getters
miniflow_values() and miniflow_get_values().

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-16 17:42:24 -07:00
+								    return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
-												dpif-netdev: Introduce netdev_flow_key_* functions

netdev_flow_key is a miniflow with the following constraints:

1) It is used only inside dpif-netdev.c.
2) It always has inline values.
3) It contains only miniflows created by miniflow_extract().

Therefore, by using these new functions instead of the miniflow_*
ones, we get the following (performance related) benefits:

- Because of (1) the functions can be inlined.
- Because of (2) and (3) the netdev_flow_key can be treated as POD.
  Specifically, because of (3), we can do comparisons with memcmp,
  since if the map is different the miniflow must be different.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
											
										
										
											2014-09-06 08:10:42 +00:00
+								}
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								static inline bool
 								emc_entry_alive(struct emc_entry *ce)
 								{
 								    return ce->flow && !ce->flow->dead;
 								}
 								static void
 								emc_clear_entry(struct emc_entry *ce)
 								{
 								    if (ce->flow) {
 								        dp_netdev_flow_unref(ce->flow);
 								        ce->flow = NULL;
 								    }
 								}
 								static inline void
 								emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								                 const struct netdev_flow_key *key)
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								{
 								    if (ce->flow != flow) {
 								        if (ce->flow) {
 								            dp_netdev_flow_unref(ce->flow);
 								        }
 								        if (dp_netdev_flow_ref(flow)) {
 								            ce->flow = flow;
 								        } else {
 								            ce->flow = NULL;
 								        }
 								    }
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    if (key) {
 								        netdev_flow_key_clone(&ce->key, key);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    }
 								}
 								static inline void
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								           struct dp_netdev_flow *flow)
 								{
 								    struct emc_entry *to_be_replaced = NULL;
 								    struct emc_entry *current_entry;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
 								        if (netdev_flow_key_equal(&current_entry->key, key)) {
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								            /* We found the entry with the 'mf' miniflow */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								            emc_change_entry(current_entry, flow, NULL);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								            return;
 								        }
 								        /* Replacement policy: put the flow in an empty (not alive) entry, or
 								         * in the first entry where it can be */
 								        if (!to_be_replaced
 								            || (emc_entry_alive(to_be_replaced)
 								                && !emc_entry_alive(current_entry))
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								            || current_entry->key.hash < to_be_replaced->key.hash) {
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								            to_be_replaced = current_entry;
 								        }
 								    }
 								    /* We didn't find the miniflow in the cache.
 								     * The 'to_be_replaced' entry is where the new flow will be stored */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    emc_change_entry(to_be_replaced, flow, key);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								}
 								static inline struct dp_netdev_flow *
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								{
 								    struct emc_entry *current_entry;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
 								        if (current_entry->key.hash == key->hash
 								            && emc_entry_alive(current_entry)
 								            && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								            /* We found the entry with the 'key->mf' miniflow */
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								            return current_entry->flow;
 								        }
 								    }
 								    return NULL;
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static struct dp_netdev_flow *
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
 								                          const struct netdev_flow_key *key,
 								                          int *lookup_num_p)
-												dpif-netdev: Introduce a classifier in userspace datapath.

Instead of an exact match flow table, we introduce a classifier.
This enables mega-flows in userspace datapath.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
[blp@nicira.com tweaked flow lookup]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-04 06:23:54 -08:00
+								{
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    struct dpcls *cls;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    struct dpcls_rule *rule;
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, in_port));
 								    struct dp_netdev_flow *netdev_flow = NULL;
-												dpif-netdev: Introduce a classifier in userspace datapath.

Instead of an exact match flow table, we introduce a classifier.
This enables mega-flows in userspace datapath.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
[blp@nicira.com tweaked flow lookup]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-04 06:23:54 -08:00
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
 								    if (OVS_LIKELY(cls)) {
 								        dpcls_lookup(cls, key, &rule, 1, lookup_num_p);
 								        netdev_flow = dp_netdev_flow_cast(rule);
 								    }
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    return netdev_flow;
-												dpif-netdev: Introduce a classifier in userspace datapath.

Instead of an exact match flow table, we introduce a classifier.
This enables mega-flows in userspace datapath.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
[blp@nicira.com tweaked flow lookup]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-04 06:23:54 -08:00
+								}
 								static struct dp_netdev_flow *
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
 								                        const ovs_u128 *ufidp, const struct nlattr *key,
 								                        size_t key_len)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												dpif-netdev: Change a variable name.

'struct dp_netdev_flow' is currently being instantiated as 'flow'.
An upcoming commit introduces a classifier to dpif-netdev
which uses 'struct flow' at a few places and that can cause
confusion while reading code.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-10-29 02:34:15 -07:00
+								    struct dp_netdev_flow *netdev_flow;
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								    struct flow flow;
 								    ovs_u128 ufid;
 								    /* If a UFID is not provided, determine one based on the key. */
 								    if (!ufidp && key && key_len
 								        && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow)) {
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								        dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								        ufidp = &ufid;
 								    }
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								    if (ufidp) {
 								        CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								                                 &pmd->flow_table) {
-												util: Pass 128-bit arguments directly instead of using pointers.

Commit f2d105b5 (ofproto-dpif-xlate: xlate ct_{mark, label} correctly.)
introduced the ovs_u128_and() function.  It directly takes ovs_u128
values as arguments instead of pointers to them.  As this is a bit more
direct way to deal with 128-bit values, modify the other utility
functions to do the same.

Signed-off-by: Justin Pettit <jpettit@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-03 18:20:51 -07:00
+								            if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								                return netdev_flow;
 								            }
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								        }
 								    }
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    return NULL;
 								}
 								static void
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
-												dpif-netdev: Change a variable name.

'struct dp_netdev_flow' is currently being instantiated as 'flow'.
An upcoming commit introduces a classifier to dpif-netdev
which uses 'struct flow' at a few places and that can cause
confusion while reading code.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-10-29 02:34:15 -07:00
+								                    struct dpif_flow_stats *stats)
-												dpif: Eliminate "struct odp_flow" from client-visible interface.

Following this commit, "struct odp_flow" and related data structures are
only used in Linux-specific parts of OVS userspace code.  This allows the
actual Linux datapath interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-26 07:03:39 -08:00
+								{
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								    struct dp_netdev_flow *netdev_flow;
 								    unsigned long long n;
 								    long long used;
 								    uint16_t flags;
 								    netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
 								    atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
 								    stats->n_packets = n;
 								    atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
 								    stats->n_bytes = n;
 								    atomic_read_relaxed(&netdev_flow->stats.used, &used);
 								    stats->used = used;
 								    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
 								    stats->tcp_flags = flags;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												dpif: Generate flow_hash for revalidators in dpif.

This patch shifts the responsibility for determining the hash for a flow
from the revalidation logic down to the dpif layer. This assists in
handling backward-compatibility for revalidation with the upcoming
unique flow identifier "UFID" patches.

A 128-bit UFID was selected to minimize the likelihood of hash conflicts.
Handler threads will not install a flow that has an identical UFID as
another flow, to prevent misattribution of stats and to ensure that the
correct flow key cache is used for revalidation.

For datapaths that do not support UFID, which is currently all
datapaths, the dpif will generate the UFID and pass it up during upcall
and flow_dump. This is generated based on the datapath flow key.

Later patches will add support for datapaths to store and interpret this
UFID, in which case the dpif has a responsibility to pass it through
transparently.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 15:24:39 +12:00
+								/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
 								 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
 								 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
 								 * protect them. */
-												dpif: Support flow_get in dpif_operate().

This cleans up the dpif interface to make it more consistent with the
other dpif operations, and allows flows to be fetched in batches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-13 09:55:54 +12:00
+								static void
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
-												dpif: Generate flow_hash for revalidators in dpif.

This patch shifts the responsibility for determining the hash for a flow
from the revalidation logic down to the dpif layer. This assists in
handling backward-compatibility for revalidation with the upcoming
unique flow identifier "UFID" patches.

A 128-bit UFID was selected to minimize the likelihood of hash conflicts.
Handler threads will not install a flow that has an identical UFID as
another flow, to prevent misattribution of stats and to ensure that the
correct flow key cache is used for revalidation.

For datapaths that do not support UFID, which is currently all
datapaths, the dpif will generate the UFID and pass it up during upcall
and flow_dump. This is generated based on the datapath flow key.

Later patches will add support for datapaths to store and interpret this
UFID, in which case the dpif has a responsibility to pass it through
transparently.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 15:24:39 +12:00
+								                            struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
+								                            struct dpif_flow *flow, bool terse)
-												dpif: Support flow_get in dpif_operate().

This cleans up the dpif interface to make it more consistent with the
other dpif operations, and allows flows to be fetched in batches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-13 09:55:54 +12:00
+								{
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
+								    if (terse) {
 								        memset(flow, 0, sizeof *flow);
 								    } else {
 								        struct flow_wildcards wc;
 								        struct dp_netdev_actions *actions;
 								        size_t offset;
-												odp-util: Convert flow serialization parameters to a struct.

Serializing between userspace flows and netlink attributes currently
requires several additional parameters besides the flows themselves.
This will continue to grow in the future as well. This converts
the function arguments to a parameters struct, which makes the code
easier to read and allowing irrelevant arguments to be omitted.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-16 11:15:28 -07:00
+								        struct odp_flow_key_parms odp_parms = {
 								            .flow = &netdev_flow->flow,
 								            .mask = &wc.masks,
-												odp-util: Share fields between odp and dpif_backer.

Datapath support for some flow key fields is used inside ofproto-dpif as
well as odp-util. Share these fields using the same structure.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-30 16:43:03 -07:00
+								            .support = dp_netdev_support,
-												odp-util: Convert flow serialization parameters to a struct.

Serializing between userspace flows and netlink attributes currently
requires several additional parameters besides the flows themselves.
This will continue to grow in the future as well. This converts
the function arguments to a parameters struct, which makes the code
easier to read and allowing irrelevant arguments to be omitted.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-16 11:15:28 -07:00
+								        };
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
 								        miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
-												dpcls: Avoid one 8-byte chunk in subtable mask.

This patch allows to skip the 8-byte chunk comprising of dp_hash and
in_port in the subtable mask when dp_hash is wildcarded.  This will
slightly speed up the hash computation as one expensive function call
to hash_add64() can be skipped.

For each new netdev flow we wildcard in_port in the mask, so in the
typical case where dp_hash is also wildcarded, the resulting 8-byte
chunk will not be part of the subtable mask.

This manipulation of the mask is possible as the datapath classifier
is explicitly selected based on the in_port value, so that all the
datapath flows in the selected classifier have an exact match on that
in_port value.  Given this, it is safe to ignore the in_port value
when doing a lookup in the chosen classifier.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Co-authored-by: Jarno Rajahalme <jarno@ovn.org>
											
										
										
											2017-01-05 15:33:13 -08:00
+								        /* in_port is exact matched, but we have left it out from the mask for
 								         * optimnization reasons. Add in_port back to the mask. */
 								        wc.masks.in_port.odp_port = ODPP_NONE;
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
 								        /* Key */
-												ofpbuf: Simplify ofpbuf API.

ofpbuf was complicated due to its wide usage across all
layers of OVS, Now we have introduced independent dp_packet
which can be used for datapath packet, we can simplify ofpbuf.
Following patch removes DPDK mbuf and access API of ofpbuf
members.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-03-02 17:29:44 -08:00
+								        offset = key_buf->size;
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
+								        flow->key = ofpbuf_tail(key_buf);
-												odp-util: Convert flow serialization parameters to a struct.

Serializing between userspace flows and netlink attributes currently
requires several additional parameters besides the flows themselves.
This will continue to grow in the future as well. This converts
the function arguments to a parameters struct, which makes the code
easier to read and allowing irrelevant arguments to be omitted.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-16 11:15:28 -07:00
+								        odp_flow_key_from_flow(&odp_parms, key_buf);
-												ofpbuf: Simplify ofpbuf API.

ofpbuf was complicated due to its wide usage across all
layers of OVS, Now we have introduced independent dp_packet
which can be used for datapath packet, we can simplify ofpbuf.
Following patch removes DPDK mbuf and access API of ofpbuf
members.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-03-02 17:29:44 -08:00
+								        flow->key_len = key_buf->size - offset;
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
 								        /* Mask */
-												ofpbuf: Simplify ofpbuf API.

ofpbuf was complicated due to its wide usage across all
layers of OVS, Now we have introduced independent dp_packet
which can be used for datapath packet, we can simplify ofpbuf.
Following patch removes DPDK mbuf and access API of ofpbuf
members.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-03-02 17:29:44 -08:00
+								        offset = mask_buf->size;
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
+								        flow->mask = ofpbuf_tail(mask_buf);
-												odp-util: Pass down flow netlink attributes when translating masks.

Sometimes we need to look at flow fields to understand how to parse
an attribute. However, masks don't have this information - just the
mask on the field. We already use the translated flow structure for
this purpose but this isn't always enough since sometimes we actually
need the raw netlink information. Fortunately, that is also readily
available so this passes it down from the appropriate callers.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-06-19 13:54:13 -07:00
+								        odp_parms.key_buf = key_buf;
-												odp-util: Convert flow serialization parameters to a struct.

Serializing between userspace flows and netlink attributes currently
requires several additional parameters besides the flows themselves.
This will continue to grow in the future as well. This converts
the function arguments to a parameters struct, which makes the code
easier to read and allowing irrelevant arguments to be omitted.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-16 11:15:28 -07:00
+								        odp_flow_key_from_mask(&odp_parms, mask_buf);
-												ofpbuf: Simplify ofpbuf API.

ofpbuf was complicated due to its wide usage across all
layers of OVS, Now we have introduced independent dp_packet
which can be used for datapath packet, we can simplify ofpbuf.
Following patch removes DPDK mbuf and access API of ofpbuf
members.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-03-02 17:29:44 -08:00
+								        flow->mask_len = mask_buf->size - offset;
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
 								        /* Actions */
 								        actions = dp_netdev_flow_get_actions(netdev_flow);
 								        flow->actions = actions->actions;
 								        flow->actions_len = actions->size;
 								    }
-												dpif: Support flow_get in dpif_operate().

This cleans up the dpif interface to make it more consistent with the
other dpif operations, and allows flows to be fetched in batches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-13 09:55:54 +12:00
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								    flow->ufid = netdev_flow->ufid;
 								    flow->ufid_present = true;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    flow->pmd_id = netdev_flow->pmd_id;
-												dpif: Support flow_get in dpif_operate().

This cleans up the dpif interface to make it more consistent with the
other dpif operations, and allows flows to be fetched in batches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-13 09:55:54 +12:00
+								    get_dpif_flow_stats(netdev_flow, &flow->stats);
 								}
-												datapath: Convert odp_flow_key to use Netlink attributes instead.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This
commit makes that change using Netlink attribute sequences.

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, because userspace doesn't yet
have enough information to do that intelligently.  Upcoming commits will
fix that.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-23 18:44:44 -08:00
+								static int
-												dpif-netdev: Properly create exact match masks.

Normally OVS userspace supplies a mask along with a flow key for each
new data path flow that should be created.  OVS also provides an
option to disable the kernel wildcarding, in which case the flows are
created without a mask.  When kernel wildcarding is disabled, the
datapath should use exact match, i.e. not wildcard any bits in the
flow key.  Currently, what happens with the userspace datapath instead
is that a datapath flow with mostly empty mask is created (i.e., most
fields are wildcarded), as the current code does not examine the given
mask key length to find out that the mask key is actually empty.  This
results in the same datapath flow matching on packets of multiple
different flows, wrong actions being processed, and stats being
incorrect.

This patch refactors userspace datapath code to explicitly initialize
a suitable exact match mask when a flow put without a mask is
executed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-11 11:07:01 -08:00
+								dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
 								                              const struct nlattr *mask_key,
 								                              uint32_t mask_key_len, const struct flow *flow,
-												dpif-netdev: Don't use metaflow to operate on userspace datapath fields.

If ofproto-dpif installs a flow into the userspace datapath that doesn't
include a mask, we need to synthesize an exact match one. This is currently
done using the metaflow infrastructure, iterating over each field and
setting it to all ones.

There is a conceptual mismatch here because metaflow is operating on
OpenFlow fields, not datapath ones. Even though they are generally very
similar, there are subtle differences, which is why it is necessary to
fix up the input port mask.

With Geneve options, the mapping is much more complicated and so the
situation is worse. The first issue is that the metaflow to flow
mapping can change over time, so we would need to do more revalidation
to track this. In addition, an upcoming patch will completely disconnect
the option format between ofproto-dpif and dpif-netdev, so the values
written by metaflow don't make sense at all.

When megaflows are turned off, ofproto-dpif internally generates masks
using flow_wildcards_init_for_packet(). Since that's the same as what
we want to do here, we can just use that instead of metaflow.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2015-07-16 09:05:33 -07:00
+								                              struct flow_wildcards *wc)
-												dpif-netdev: Properly create exact match masks.

Normally OVS userspace supplies a mask along with a flow key for each
new data path flow that should be created.  OVS also provides an
option to disable the kernel wildcarding, in which case the flows are
created without a mask.  When kernel wildcarding is disabled, the
datapath should use exact match, i.e. not wildcard any bits in the
flow key.  Currently, what happens with the userspace datapath instead
is that a datapath flow with mostly empty mask is created (i.e., most
fields are wildcarded), as the current code does not examine the given
mask key length to find out that the mask key is actually empty.  This
results in the same datapath flow matching on packets of multiple
different flows, wrong actions being processed, and stats being
incorrect.

This patch refactors userspace datapath code to explicitly initialize
a suitable exact match mask when a flow put without a mask is
executed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-11 11:07:01 -08:00
+								{
-												odp-util: Return exact mask if netlink mask attribute is missing.

In the ODP context an empty mask netlink attribute usually means that
the flow should be an exact match.

odp_flow_key_to_mask{,_udpif}() instead return a struct flow_wildcards
with matches only on recirc_id and vlan_tci.

A more appropriate behavior is to handle a missing (zero length) netlink
mask specially (like we do in userspace and Linux datapath) and create
an exact match flow_wildcards from the original flow.

This fixes a bug in revalidate_ukey(): every flow created with
megaflows disabled would be revalidated away, because the mask would
seem too generic. (Another possible fix would be to handle the special
case of a missing mask in revalidate_ukey(), but this seems a more
generic solution).

											
										
										
											2015-12-07 17:30:25 -08:00
+								    enum odp_key_fitness fitness;
-												tun-metadata: Manage tunnel TLV mapping table on a per-bridge basis.

When using tunnel TLVs (at the moment, this means Geneve options), a
controller must first map the class and type onto an appropriate OXM
field so that it can be used in OVS flow operations. This table is
managed using OpenFlow extensions.

The original code that added support for TLVs made the mapping table
global as a simplification. However, this is not really logically
correct as the OpenFlow management commands are operating on a per-bridge
basis. This removes the original limitation to make the table per-bridge.

One nice result of this change is that it is generally clearer whether
the tunnel metadata is in datapath or OpenFlow format. Rather than
allowing ad-hoc format changes and trying to handle both formats in the
tunnel metadata functions, the format is more clearly separated by function.
Datapaths (both kernel and userspace) use datapath format and it is not
changed during the upcall process. At the beginning of action translation,
tunnel metadata is converted to OpenFlow format and flows and wildcards
are translated back at the end of the process.

As an additional benefit, this change improves performance in some flow
setup situations by keeping the tunnel metadata in the original packet
format in more cases. This helps when copies need to be made as the amount
of data touched is only what is present in the packet rather than the
maximum amount of metadata supported.

Co-authored-by: Madhu Challa <challa@noironetworks.com>
Signed-off-by: Madhu Challa <challa@noironetworks.com>
Signed-off-by: Jesse Gross <jesse@kernel.org>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-04-19 18:36:04 -07:00
+								    fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
-												odp-util: Return exact mask if netlink mask attribute is missing.

In the ODP context an empty mask netlink attribute usually means that
the flow should be an exact match.

odp_flow_key_to_mask{,_udpif}() instead return a struct flow_wildcards
with matches only on recirc_id and vlan_tci.

A more appropriate behavior is to handle a missing (zero length) netlink
mask specially (like we do in userspace and Linux datapath) and create
an exact match flow_wildcards from the original flow.

This fixes a bug in revalidate_ukey(): every flow created with
megaflows disabled would be revalidated away, because the mask would
seem too generic. (Another possible fix would be to handle the special
case of a missing mask in revalidate_ukey(), but this seems a more
generic solution).

											
										
										
											2015-12-07 17:30:25 -08:00
+								    if (fitness) {
 								        /* This should not happen: it indicates that
 								         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
 								         * disagree on the acceptable form of a mask.  Log the problem
 								         * as an error, with enough details to enable debugging. */
 								        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
 								        if (!VLOG_DROP_ERR(&rl)) {
 								            struct ds s;
-												dpif-netdev: Properly create exact match masks.

Normally OVS userspace supplies a mask along with a flow key for each
new data path flow that should be created.  OVS also provides an
option to disable the kernel wildcarding, in which case the flows are
created without a mask.  When kernel wildcarding is disabled, the
datapath should use exact match, i.e. not wildcard any bits in the
flow key.  Currently, what happens with the userspace datapath instead
is that a datapath flow with mostly empty mask is created (i.e., most
fields are wildcarded), as the current code does not examine the given
mask key length to find out that the mask key is actually empty.  This
results in the same datapath flow matching on packets of multiple
different flows, wrong actions being processed, and stats being
incorrect.

This patch refactors userspace datapath code to explicitly initialize
a suitable exact match mask when a flow put without a mask is
executed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-11 11:07:01 -08:00
-												odp-util: Return exact mask if netlink mask attribute is missing.

In the ODP context an empty mask netlink attribute usually means that
the flow should be an exact match.

odp_flow_key_to_mask{,_udpif}() instead return a struct flow_wildcards
with matches only on recirc_id and vlan_tci.

A more appropriate behavior is to handle a missing (zero length) netlink
mask specially (like we do in userspace and Linux datapath) and create
an exact match flow_wildcards from the original flow.

This fixes a bug in revalidate_ukey(): every flow created with
megaflows disabled would be revalidated away, because the mask would
seem too generic. (Another possible fix would be to handle the special
case of a missing mask in revalidate_ukey(), but this seems a more
generic solution).

											
										
										
											2015-12-07 17:30:25 -08:00
+								            ds_init(&s);
 								            odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
 								                            true);
 								            VLOG_ERR("internal error parsing flow mask %s (%s)",
 								                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
 								            ds_destroy(&s);
-												dpif-netdev: Properly create exact match masks.

Normally OVS userspace supplies a mask along with a flow key for each
new data path flow that should be created.  OVS also provides an
option to disable the kernel wildcarding, in which case the flows are
created without a mask.  When kernel wildcarding is disabled, the
datapath should use exact match, i.e. not wildcard any bits in the
flow key.  Currently, what happens with the userspace datapath instead
is that a datapath flow with mostly empty mask is created (i.e., most
fields are wildcarded), as the current code does not examine the given
mask key length to find out that the mask key is actually empty.  This
results in the same datapath flow matching on packets of multiple
different flows, wrong actions being processed, and stats being
incorrect.

This patch refactors userspace datapath code to explicitly initialize
a suitable exact match mask when a flow put without a mask is
executed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-11 11:07:01 -08:00
+								        }
-												odp-util: Return exact mask if netlink mask attribute is missing.

In the ODP context an empty mask netlink attribute usually means that
the flow should be an exact match.

odp_flow_key_to_mask{,_udpif}() instead return a struct flow_wildcards
with matches only on recirc_id and vlan_tci.

A more appropriate behavior is to handle a missing (zero length) netlink
mask specially (like we do in userspace and Linux datapath) and create
an exact match flow_wildcards from the original flow.

This fixes a bug in revalidate_ukey(): every flow created with
megaflows disabled would be revalidated away, because the mask would
seem too generic. (Another possible fix would be to handle the special
case of a missing mask in revalidate_ukey(), but this seems a more
generic solution).

											
										
										
											2015-12-07 17:30:25 -08:00
 								        return EINVAL;
-												dpif-netdev: Properly create exact match masks.

Normally OVS userspace supplies a mask along with a flow key for each
new data path flow that should be created.  OVS also provides an
option to disable the kernel wildcarding, in which case the flows are
created without a mask.  When kernel wildcarding is disabled, the
datapath should use exact match, i.e. not wildcard any bits in the
flow key.  Currently, what happens with the userspace datapath instead
is that a datapath flow with mostly empty mask is created (i.e., most
fields are wildcarded), as the current code does not examine the given
mask key length to find out that the mask key is actually empty.  This
results in the same datapath flow matching on packets of multiple
different flows, wrong actions being processed, and stats being
incorrect.

This patch refactors userspace datapath code to explicitly initialize
a suitable exact match mask when a flow put without a mask is
executed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-11 11:07:01 -08:00
+								    }
 								    return 0;
 								}
 								static int
 								dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
 								                              struct flow *flow)
-												datapath: Convert odp_flow_key to use Netlink attributes instead.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This
commit makes that change using Netlink attribute sequences.

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, because userspace doesn't yet
have enough information to do that intelligently.  Upcoming commits will
fix that.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-23 18:44:44 -08:00
+								{
-												dpif-netdev: Make "packet-out" with in_port=OFPP_CONTROLLER work again.

Commit 4e022ec09e14 (Create specific types for ofp and odp port) broke
OpenFlow OFPP_PACKET_OUT requests that use in_port=OFPP_CONTROLLER.  This
commit fixes the problem and adds a regression test.

CC: Alex Wang <alexw@nicira.com>
Reported-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-09 09:23:02 -07:00
+								    odp_port_t in_port;
-												tun-metadata: Manage tunnel TLV mapping table on a per-bridge basis.

When using tunnel TLVs (at the moment, this means Geneve options), a
controller must first map the class and type onto an appropriate OXM
field so that it can be used in OVS flow operations. This table is
managed using OpenFlow extensions.

The original code that added support for TLVs made the mapping table
global as a simplification. However, this is not really logically
correct as the OpenFlow management commands are operating on a per-bridge
basis. This removes the original limitation to make the table per-bridge.

One nice result of this change is that it is generally clearer whether
the tunnel metadata is in datapath or OpenFlow format. Rather than
allowing ad-hoc format changes and trying to handle both formats in the
tunnel metadata functions, the format is more clearly separated by function.
Datapaths (both kernel and userspace) use datapath format and it is not
changed during the upcall process. At the beginning of action translation,
tunnel metadata is converted to OpenFlow format and flows and wildcards
are translated back at the end of the process.

As an additional benefit, this change improves performance in some flow
setup situations by keeping the tunnel metadata in the original packet
format in more cases. This helps when copies need to be made as the amount
of data touched is only what is present in the packet rather than the
maximum amount of metadata supported.

Co-authored-by: Madhu Challa <challa@noironetworks.com>
Signed-off-by: Madhu Challa <challa@noironetworks.com>
Signed-off-by: Jesse Gross <jesse@kernel.org>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-04-19 18:36:04 -07:00
+								    if (odp_flow_key_to_flow(key, key_len, flow)) {
-												datapath: Convert odp_flow_key to use Netlink attributes instead.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This
commit makes that change using Netlink attribute sequences.

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, because userspace doesn't yet
have enough information to do that intelligently.  Upcoming commits will
fix that.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-23 18:44:44 -08:00
+								        /* This should not happen: it indicates that odp_flow_key_from_flow()
-												dpif-netdev: Properly create exact match masks.

Normally OVS userspace supplies a mask along with a flow key for each
new data path flow that should be created.  OVS also provides an
option to disable the kernel wildcarding, in which case the flows are
created without a mask.  When kernel wildcarding is disabled, the
datapath should use exact match, i.e. not wildcard any bits in the
flow key.  Currently, what happens with the userspace datapath instead
is that a datapath flow with mostly empty mask is created (i.e., most
fields are wildcarded), as the current code does not examine the given
mask key length to find out that the mask key is actually empty.  This
results in the same datapath flow matching on packets of multiple
different flows, wrong actions being processed, and stats being
incorrect.

This patch refactors userspace datapath code to explicitly initialize
a suitable exact match mask when a flow put without a mask is
executed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-11 11:07:01 -08:00
+								         * and odp_flow_key_to_flow() disagree on the acceptable form of a
 								         * flow.  Log the problem as an error, with enough details to enable
 								         * debugging. */
-												datapath: Convert odp_flow_key to use Netlink attributes instead.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This
commit makes that change using Netlink attribute sequences.

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, because userspace doesn't yet
have enough information to do that intelligently.  Upcoming commits will
fix that.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-23 18:44:44 -08:00
+								        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
 								        if (!VLOG_DROP_ERR(&rl)) {
 								            struct ds s;
 								            ds_init(&s);
-												dpif-netdev: Properly create exact match masks.

Normally OVS userspace supplies a mask along with a flow key for each
new data path flow that should be created.  OVS also provides an
option to disable the kernel wildcarding, in which case the flows are
created without a mask.  When kernel wildcarding is disabled, the
datapath should use exact match, i.e. not wildcard any bits in the
flow key.  Currently, what happens with the userspace datapath instead
is that a datapath flow with mostly empty mask is created (i.e., most
fields are wildcarded), as the current code does not examine the given
mask key length to find out that the mask key is actually empty.  This
results in the same datapath flow matching on packets of multiple
different flows, wrong actions being processed, and stats being
incorrect.

This patch refactors userspace datapath code to explicitly initialize
a suitable exact match mask when a flow put without a mask is
executed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-11 11:07:01 -08:00
+								            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
-												datapath: Convert odp_flow_key to use Netlink attributes instead.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This
commit makes that change using Netlink attribute sequences.

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, because userspace doesn't yet
have enough information to do that intelligently.  Upcoming commits will
fix that.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-23 18:44:44 -08:00
+								            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
 								            ds_destroy(&s);
 								        }
 								        return EINVAL;
 								    }
-												dpif-netdev: Make "packet-out" with in_port=OFPP_CONTROLLER work again.

Commit 4e022ec09e14 (Create specific types for ofp and odp port) broke
OpenFlow OFPP_PACKET_OUT requests that use in_port=OFPP_CONTROLLER.  This
commit fixes the problem and adds a regression test.

CC: Alex Wang <alexw@nicira.com>
Reported-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-09 09:23:02 -07:00
+								    in_port = flow->in_port.odp_port;
 								    if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
-												datapath: Allow a packet with no input port to omit OVS_KEY_ATTR_IN_PORT.

When ovs-vswitchd executes actions on a synthesized packet, that is, on a
packet that is not being forwarded from any particular port but is being
generated by ovs-vswitchd itself or by an OpenFlow controller (using a
OFPT_PACKET_OUT message with an in_port of OFPP_NONE), there is no good
choice for the in_port to pass to the kernel in the flow in the
OVS_PACKET_CMD_EXECUTE message.  This commit allows ovs-vswitchd to omit
the in_port entirely in this case.

This fixes a bug in OFPT_PACKET_OUT: using an in_port of OFPP_NONE would
cause the packet to be dropped by the kernel, since that's an invalid
input port.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Reported-by: Aaron Rosen <arosen@clemson.edu>

											
										
										
											2011-09-08 16:30:20 -07:00
+								        return EINVAL;
 								    }
-												dpif-netdev: Execute conntrack action.

This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will not
discard anymore flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>

											
										
										
											2015-11-15 22:07:25 -08:00
+								    if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
-												Add support for connection tracking.

This patch adds a new action and fields to OVS that allow connection
tracking to be performed. This support works in conjunction with the
Linux kernel support merged into the Linux-4.3 development cycle.

Packets have two possible states with respect to connection tracking:
Untracked packets have not previously passed through the connection
tracker, while tracked packets have previously been through the
connection tracker. For OpenFlow pipeline processing, untracked packets
can become tracked, and they will remain tracked until the end of the
pipeline. Tracked packets cannot become untracked.

Connections can be unknown, uncommitted, or committed. Packets which are
untracked have unknown connection state. To know the connection state,
the packet must become tracked. Uncommitted connections have no
connection state stored about them, so it is only possible for the
connection tracker to identify whether they are a new connection or
whether they are invalid. Committed connections have connection state
stored beyond the lifetime of the packet, which allows later packets in
the same connection to be identified as part of the same established
connection, or related to an existing connection - for instance ICMP
error responses.

The new 'ct' action transitions the packet from "untracked" to
"tracked" by sending this flow through the connection tracker.
The following parameters are supported initally:

- "commit": When commit is executed, the connection moves from
  uncommitted state to committed state. This signals that information
  about the connection should be stored beyond the lifetime of the
  packet within the pipeline. This allows future packets in the same
  connection to be recognized as part of the same "established" (est)
  connection, as well as identifying packets in the reply (rpl)
  direction, or packets related to an existing connection (rel).
- "zone=[u16|NXM]": Perform connection tracking in the zone specified.
  Each zone is an independent connection tracking context. When the
  "commit" parameter is used, the connection will only be committed in
  the specified zone, and not in other zones. This is 0 by default.
- "table=NUMBER": Fork pipeline processing in two. The original instance
  of the packet will continue processing the current actions list as an
  untracked packet. An additional instance of the packet will be sent to
  the connection tracker, which will be re-injected into the OpenFlow
  pipeline to resume processing in the specified table, with the
  ct_state and other ct match fields set. If the table is not specified,
  then the packet is submitted to the connection tracker, but the
  pipeline does not fork and the ct match fields are not populated. It
  is strongly recommended to specify a table later than the current
  table to prevent loops.

When the "table" option is used, the packet that continues processing in
the specified table will have the ct_state populated. The ct_state may
have any of the following flags set:

- Tracked (trk): Connection tracking has occurred.
- Reply (rpl): The flow is in the reply direction.
- Invalid (inv): The connection tracker couldn't identify the connection.
- New (new): This is the beginning of a new connection.
- Established (est): This is part of an already existing connection.
- Related (rel): This connection is related to an existing connection.

For more information, consult the ovs-ofctl(8) man pages.

Below is a simple example flow table to allow outbound TCP traffic from
port 1 and drop traffic from port 2 that was not initiated by port 1:

    table=0,priority=1,action=drop
    table=0,arp,action=normal
    table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2
    table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1)
    table=1,in_port=2,ct_state=+trk+est,tcp,action=1
    table=1,in_port=2,ct_state=+trk+new,tcp,action=drop

Based on original design by Justin Pettit, contributions from Thomas
Graf and Daniele Di Proietto.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-08-11 10:56:09 -07:00
+								        return EINVAL;
 								    }
-												datapath: Convert odp_flow_key to use Netlink attributes instead.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This
commit makes that change using Netlink attribute sequences.

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, because userspace doesn't yet
have enough information to do that intelligently.  Upcoming commits will
fix that.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-23 18:44:44 -08:00
+								    return 0;
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static int
-												dpif: Support flow_get in dpif_operate().

This cleans up the dpif interface to make it more consistent with the
other dpif operations, and allows flows to be fetched in batches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-13 09:55:54 +12:00
+								dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Change a variable name.

'struct dp_netdev_flow' is currently being instantiated as 'flow'.
An upcoming commit introduces a classifier to dpif-netdev
which uses 'struct flow' at a few places and that can cause
confusion while reading code.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-10-29 02:34:15 -07:00
+								    struct dp_netdev_flow *netdev_flow;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    struct dp_netdev_pmd_thread *pmd;
-												dpctl: Implement dpctl/flow-get for dpif-netdev.

Currently 'dpctl/flow-get' doesn't work for flows installed by
PMD threads.

Fix that by implementing search across all PMD threads. Will be returned
flow from first PMD thread with match.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-05-27 16:32:50 +03:00
+								    struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
 								    struct hmapx_node *node;
 								    int error = EINVAL;
 								    if (get->pmd_id == PMD_ID_NULL) {
 								        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
 								            if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
 								                dp_netdev_pmd_unref(pmd);
 								            }
 								        }
 								    } else {
 								        pmd = dp_netdev_get_pmd(dp, get->pmd_id);
 								        if (!pmd) {
 								            goto out;
 								        }
 								        hmapx_add(&to_find, pmd);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    }
-												dpctl: Implement dpctl/flow-get for dpif-netdev.

Currently 'dpctl/flow-get' doesn't work for flows installed by
PMD threads.

Fix that by implementing search across all PMD threads. Will be returned
flow from first PMD thread with match.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-05-27 16:32:50 +03:00
+								    if (!hmapx_count(&to_find)) {
 								        goto out;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
-												dpctl: Implement dpctl/flow-get for dpif-netdev.

Currently 'dpctl/flow-get' doesn't work for flows installed by
PMD threads.

Fix that by implementing search across all PMD threads. Will be returned
flow from first PMD thread with match.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-05-27 16:32:50 +03:00
+								    HMAPX_FOR_EACH (node, &to_find) {
 								        pmd = (struct dp_netdev_pmd_thread *) node->data;
 								        netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
 								                                              get->key_len);
 								        if (netdev_flow) {
 								            dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
 								                                        get->flow, false);
 								            error = 0;
 								            break;
 								        } else {
 								            error = ENOENT;
 								        }
 								    }
-												datapath: Change ODP_FLOW_GET to retrieve only a single flow at a time.

This brings the code closer to what the Netlink interface will need to
implement.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-17 14:40:58 -08:00
-												dpctl: Implement dpctl/flow-get for dpif-netdev.

Currently 'dpctl/flow-get' doesn't work for flows installed by
PMD threads.

Fix that by implementing search across all PMD threads. Will be returned
flow from first PMD thread with match.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-05-27 16:32:50 +03:00
+								    HMAPX_FOR_EACH (node, &to_find) {
 								        pmd = (struct dp_netdev_pmd_thread *) node->data;
 								        dp_netdev_pmd_unref(pmd);
 								    }
 								out:
 								    hmapx_destroy(&to_find);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								    return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								static struct dp_netdev_flow *
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
 								                   struct match *match, const ovs_u128 *ufid,
-												dpif-netdev: Avoid useless flow copy in dp_netdev_flow_add().

This patch gives dp_netdev_flow_add() a match with which it can
initialize the classifier rule.  This prevents it from needing to copy
a flow and flow_wildcards into the match first.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-27 18:56:45 -07:00
+								                   const struct nlattr *actions, size_t actions_len)
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    OVS_REQUIRES(pmd->flow_mutex)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    struct dp_netdev_flow *flow;
 								    struct netdev_flow_key mask;
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    struct dpcls *cls;
-												dpcls: Avoid one 8-byte chunk in subtable mask.

This patch allows to skip the 8-byte chunk comprising of dp_hash and
in_port in the subtable mask when dp_hash is wildcarded.  This will
slightly speed up the hash computation as one expensive function call
to hash_add64() can be skipped.

For each new netdev flow we wildcard in_port in the mask, so in the
typical case where dp_hash is also wildcarded, the resulting 8-byte
chunk will not be part of the subtable mask.

This manipulation of the mask is possible as the datapath classifier
is explicitly selected based on the in_port value, so that all the
datapath flows in the selected classifier have an exact match on that
in_port value.  Given this, it is safe to ignore the in_port value
when doing a lookup in the chosen classifier.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Co-authored-by: Jarno Rajahalme <jarno@ovn.org>
											
										
										
											2017-01-05 15:33:13 -08:00
 								    /* Make sure in_port is exact matched before we read it. */
 								    ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    odp_port_t in_port = match->flow.in_port.odp_port;
-												dpif-netdev: Reintroduce ref_cnt for dp_netdev_flow

struct dp_netdev_flow used to have a reference counter.
It has been replaced by RCU. Unfortunately RCU is not
enough if we plan to hold a reference to the dp_netdev_flow
for a long time. So this commit reintroduces reference
counting for struct dp_netdev_flow

Subsequent commits make use of it.

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-11 17:25:50 -07:00
-												dpcls: Avoid one 8-byte chunk in subtable mask.

This patch allows to skip the 8-byte chunk comprising of dp_hash and
in_port in the subtable mask when dp_hash is wildcarded.  This will
slightly speed up the hash computation as one expensive function call
to hash_add64() can be skipped.

For each new netdev flow we wildcard in_port in the mask, so in the
typical case where dp_hash is also wildcarded, the resulting 8-byte
chunk will not be part of the subtable mask.

This manipulation of the mask is possible as the datapath classifier
is explicitly selected based on the in_port value, so that all the
datapath flows in the selected classifier have an exact match on that
in_port value.  Given this, it is safe to ignore the in_port value
when doing a lookup in the chosen classifier.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Co-authored-by: Jarno Rajahalme <jarno@ovn.org>
											
										
										
											2017-01-05 15:33:13 -08:00
+								    /* As we select the dpcls based on the port number, each netdev flow
 								     * belonging to the same dpcls will have the same odp_port value.
 								     * For performance reasons we wildcard odp_port here in the mask.  In the
 								     * typical case dp_hash is also wildcarded, and the resulting 8-byte
 								     * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
 								     * will not be part of the subtable mask.
 								     * This will speed up the hash computation during dpcls_lookup() because
 								     * there is one less call to hash_add64() in this case. */
 								    match->wc.masks.in_port.odp_port = 0;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    netdev_flow_mask_init(&mask, match);
-												dpcls: Avoid one 8-byte chunk in subtable mask.

This patch allows to skip the 8-byte chunk comprising of dp_hash and
in_port in the subtable mask when dp_hash is wildcarded.  This will
slightly speed up the hash computation as one expensive function call
to hash_add64() can be skipped.

For each new netdev flow we wildcard in_port in the mask, so in the
typical case where dp_hash is also wildcarded, the resulting 8-byte
chunk will not be part of the subtable mask.

This manipulation of the mask is possible as the datapath classifier
is explicitly selected based on the in_port value, so that all the
datapath flows in the selected classifier have an exact match on that
in_port value.  Given this, it is safe to ignore the in_port value
when doing a lookup in the chosen classifier.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Co-authored-by: Jarno Rajahalme <jarno@ovn.org>
											
										
										
											2017-01-05 15:33:13 -08:00
+								    match->wc.masks.in_port.odp_port = ODPP_NONE;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    /* Make sure wc does not have metadata. */
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
 								               && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
-												dpif-netdev: Use ovsthread_stats for flow stats.

This should scale better than a single mutex, though still not
ideally.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-01-22 16:03:10 -08:00
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    /* Do not allocate extra space. */
-												lib/dpif-netdev: Fix EMC lookup.

Patch 0de8783a9 (lib/dpif-netdev: Integrate megaflow classifier.)
broke exact match cache lookup, but it went undetected since there are
no separate stats for EMC.

This patch fixes the problem by changing the struct netdev_flow_key
'len' member to cover only the 'mf' member, not the whole
netdev_flow_key, and ignoring the 'len' field in
netdev_flow_key_equal.  Comparison is still accurate, as the miniflow
'map' field encodes the length in the number of 1-bits, and the map is
included in the comparison.

Reported-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>

											
										
										
											2014-10-17 17:03:13 -07:00
+								    flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    memset(&flow->stats, 0, sizeof flow->stats);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    flow->dead = false;
-												dpif-netdev: Store batch pointer in dp_netdev_flow.

The userspace datapath

1. receives a batch of packets.
2. finds a 'netdev_flow' (megaflow) for each packet.
3. groups the packets in output batches based on the 'netdev_flow'.

Until now the grouping (2) was done using a simple algorithm with a
O(N^2) runtime, where N is the number of distinct megaflows of the packets
in the incoming batch.  This could quickly become a bottleneck, even with
a small number of megaflows.

With this commit the datapath simply stores in the 'netdev_flow' (the
megaflow) a pointer to the output batch, if one has been created for the
current input batch.  The pointer will be cleared when the output batch
is sent.

In a simple phy2phy test with 128 megaflows the throughput is more than
doubled.

The reason that stopped us from doing this change was that the
'netdev_flow' memory was shared between multiple threads: this is no
longer the case with the per-thread classifier.

Also, this commit reorders struct dp_netdev_flow to group toghether the
members used in the fastpath.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:50 -07:00
+								    flow->batch = NULL;
-												ovs-numa: Change 'core_id' to unsigned.

DPDK lcore_id is unsigned.  We need to support big values like
LCORE_ID_ANY (=UINT32_MAX).  Therefore I am changing the type everywhere
in OVS.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:19 +01:00
+								    *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    *CONST_CAST(struct flow *, &flow->flow) = match->flow;
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								    *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    ovs_refcount_init(&flow->ref_cnt);
 								    ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
-												dpif-netdev: Introduce a classifier in userspace datapath.

Instead of an exact match flow table, we introduce a classifier.
This enables mega-flows in userspace datapath.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
[blp@nicira.com tweaked flow lookup]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-04 06:23:54 -08:00
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
-												dpcls: Avoid one 8-byte chunk in subtable mask.

This patch allows to skip the 8-byte chunk comprising of dp_hash and
in_port in the subtable mask when dp_hash is wildcarded.  This will
slightly speed up the hash computation as one expensive function call
to hash_add64() can be skipped.

For each new netdev flow we wildcard in_port in the mask, so in the
typical case where dp_hash is also wildcarded, the resulting 8-byte
chunk will not be part of the subtable mask.

This manipulation of the mask is possible as the datapath classifier
is explicitly selected based on the in_port value, so that all the
datapath flows in the selected classifier have an exact match on that
in_port value.  Given this, it is safe to ignore the in_port value
when doing a lookup in the chosen classifier.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Co-authored-by: Jarno Rajahalme <jarno@ovn.org>
											
										
										
											2017-01-05 15:33:13 -08:00
+								    /* Select dpcls for in_port. Relies on in_port to be exact match. */
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
 								    dpcls_insert(cls, &flow->cr, &mask);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Fix rare flow add race condition.

Before this patch, dp_netdev_flow_add() inserted newly minted flows in
the "flow_table" cmap before inserting them into the per core "dpcls"
classifier.  Since dpcls_insert() initializes 'flow->cr.mask', there's
a brief window where the flow is accessible from the cmap, but has a
bogus mask value.

In my testing, under rare instances (i.e. once every 20 minutes with a
very specific flow table and traffic pattern), revalidators core dump
when they call dpif_netdev_flow_dump_next(), which accesses this bogus
mask value from dp_netdev_flow_to_dpif_flow().

By inserting into the per core classifier before the cmap, all the
values are guaranteed to be initialized during flow dumps.  With this
patch, I can no longer reproduce the crash.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2015-01-03 11:39:14 -08:00
+								    cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
 								                dp_netdev_flow_hash(&flow->ufid));
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								    if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
 								        struct ds ds = DS_EMPTY_INITIALIZER;
-												dpif-netdev: Print installed flows in dpif format.

When debug logging is enabled, dpif-netdev can print each flow as it is
installed, which it currently does using OpenFlow match formatting. Compared
to ODP formatting, there generally isn't too much difference since the
fields are largely the same but it is inconsistent with other logging in
dpif-netdev as well as the analogous functions that deal with the kernel.

However, in some cases there is a difference between the two formats, such
as in the cases of input port or tunnel metadata. For input port, datapath
format helped detect that the generated masks were incorrect. As for tunnels,
at the moment, it's possible to convert between the two formats on demand as
we have a global metadata table. In the future, though this won't be possible
as the metadata table becomes per-bridge which the datapath won't have access
to.

Signed-off-by: Jesse Gross <jesse@kernel.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-05-28 09:56:07 -07:00
+								        struct ofpbuf key_buf, mask_buf;
 								        struct odp_flow_key_parms odp_parms = {
 								            .flow = &match->flow,
 								            .mask = &match->wc.masks,
 								            .support = dp_netdev_support,
 								        };
 								        ofpbuf_init(&key_buf, 0);
 								        ofpbuf_init(&mask_buf, 0);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
-												dpif-netdev: Print installed flows in dpif format.

When debug logging is enabled, dpif-netdev can print each flow as it is
installed, which it currently does using OpenFlow match formatting. Compared
to ODP formatting, there generally isn't too much difference since the
fields are largely the same but it is inconsistent with other logging in
dpif-netdev as well as the analogous functions that deal with the kernel.

However, in some cases there is a difference between the two formats, such
as in the cases of input port or tunnel metadata. For input port, datapath
format helped detect that the generated masks were incorrect. As for tunnels,
at the moment, it's possible to convert between the two formats on demand as
we have a global metadata table. In the future, though this won't be possible
as the metadata table becomes per-bridge which the datapath won't have access
to.

Signed-off-by: Jesse Gross <jesse@kernel.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-05-28 09:56:07 -07:00
+								        odp_flow_key_from_flow(&odp_parms, &key_buf);
 								        odp_parms.key_buf = &key_buf;
 								        odp_flow_key_from_mask(&odp_parms, &mask_buf);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								        ds_put_cstr(&ds, "flow_add: ");
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								        odp_format_ufid(ufid, &ds);
 								        ds_put_cstr(&ds, " ");
-												dpif-netdev: Print installed flows in dpif format.

When debug logging is enabled, dpif-netdev can print each flow as it is
installed, which it currently does using OpenFlow match formatting. Compared
to ODP formatting, there generally isn't too much difference since the
fields are largely the same but it is inconsistent with other logging in
dpif-netdev as well as the analogous functions that deal with the kernel.

However, in some cases there is a difference between the two formats, such
as in the cases of input port or tunnel metadata. For input port, datapath
format helped detect that the generated masks were incorrect. As for tunnels,
at the moment, it's possible to convert between the two formats on demand as
we have a global metadata table. In the future, though this won't be possible
as the metadata table becomes per-bridge which the datapath won't have access
to.

Signed-off-by: Jesse Gross <jesse@kernel.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-05-28 09:56:07 -07:00
+								        odp_flow_format(key_buf.data, key_buf.size,
 								                        mask_buf.data, mask_buf.size,
 								                        NULL, &ds, false);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								        ds_put_cstr(&ds, ", actions:");
 								        format_odp_actions(&ds, actions, actions_len);
 								        VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));
-												dpif-netdev: Print installed flows in dpif format.

When debug logging is enabled, dpif-netdev can print each flow as it is
installed, which it currently does using OpenFlow match formatting. Compared
to ODP formatting, there generally isn't too much difference since the
fields are largely the same but it is inconsistent with other logging in
dpif-netdev as well as the analogous functions that deal with the kernel.

However, in some cases there is a difference between the two formats, such
as in the cases of input port or tunnel metadata. For input port, datapath
format helped detect that the generated masks were incorrect. As for tunnels,
at the moment, it's possible to convert between the two formats on demand as
we have a global metadata table. In the future, though this won't be possible
as the metadata table becomes per-bridge which the datapath won't have access
to.

Signed-off-by: Jesse Gross <jesse@kernel.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-05-28 09:56:07 -07:00
+								        ofpbuf_uninit(&key_buf);
 								        ofpbuf_uninit(&mask_buf);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								        ds_destroy(&ds);
 								    }
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    return flow;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static int
-												dpif: Change provider interface to consistently use operation structs.

Until now, a "flow put" has represented its parameters in two different
ways, depending on whether it was coming from dpif_flow_put() or from
dpif_operate(), and similarly for an "execute" operation.  This commit
adopts the operation struct consistently within the dpif provider
interface, which seems cleaner.

This commit also factors out logging for flow puts and executes, which
is useful in the following commit.

This doesn't change the dpif client interface, since the two forms are
more convenient for clients than always filling out an operation struct.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2011-12-26 14:39:03 -08:00
+								dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Change a variable name.

'struct dp_netdev_flow' is currently being instantiated as 'flow'.
An upcoming commit introduces a classifier to dpif-netdev
which uses 'struct flow' at a few places and that can cause
confusion while reading code.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-10-29 02:34:15 -07:00
+								    struct dp_netdev_flow *netdev_flow;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    struct netdev_flow_key key;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: Avoid useless flow copy in dp_netdev_flow_add().

This patch gives dp_netdev_flow_add() a match with which it can
initialize the classifier rule.  This prevents it from needing to copy
a flow and flow_wildcards into the match first.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-27 18:56:45 -07:00
+								    struct match match;
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								    ovs_u128 ufid;
-												ovs-numa: Change 'core_id' to unsigned.

DPDK lcore_id is unsigned.  We need to support big values like
LCORE_ID_ANY (=UINT32_MAX).  Therefore I am changing the type everywhere
in OVS.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:19 +01:00
+								    unsigned pmd_id = put->pmd_id == PMD_ID_NULL
 								                      ? NON_PMD_CORE_ID : put->pmd_id;
-												datapath: Convert odp_flow_key to use Netlink attributes instead.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This
commit makes that change using Netlink attribute sequences.

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, because userspace doesn't yet
have enough information to do that intelligently.  Upcoming commits will
fix that.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-23 18:44:44 -08:00
+								    int error;
-												dpif-netdev: Avoid useless flow copy in dp_netdev_flow_add().

This patch gives dp_netdev_flow_add() a match with which it can
initialize the classifier rule.  This prevents it from needing to copy
a flow and flow_wildcards into the match first.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-27 18:56:45 -07:00
+								    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
-												dpif-netdev: Properly create exact match masks.

Normally OVS userspace supplies a mask along with a flow key for each
new data path flow that should be created.  OVS also provides an
option to disable the kernel wildcarding, in which case the flows are
created without a mask.  When kernel wildcarding is disabled, the
datapath should use exact match, i.e. not wildcard any bits in the
flow key.  Currently, what happens with the userspace datapath instead
is that a datapath flow with mostly empty mask is created (i.e., most
fields are wildcarded), as the current code does not examine the given
mask key length to find out that the mask key is actually empty.  This
results in the same datapath flow matching on packets of multiple
different flows, wrong actions being processed, and stats being
incorrect.

This patch refactors userspace datapath code to explicitly initialize
a suitable exact match mask when a flow put without a mask is
executed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-11 11:07:01 -08:00
+								    if (error) {
 								        return error;
 								    }
 								    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
 								                                          put->mask, put->mask_len,
-												dpif-netdev: Don't use metaflow to operate on userspace datapath fields.

If ofproto-dpif installs a flow into the userspace datapath that doesn't
include a mask, we need to synthesize an exact match one. This is currently
done using the metaflow infrastructure, iterating over each field and
setting it to all ones.

There is a conceptual mismatch here because metaflow is operating on
OpenFlow fields, not datapath ones. Even though they are generally very
similar, there are subtle differences, which is why it is necessary to
fix up the input port mask.

With Geneve options, the mapping is much more complicated and so the
situation is worse. The first issue is that the metaflow to flow
mapping can change over time, so we would need to do more revalidation
to track this. In addition, an upcoming patch will completely disconnect
the option format between ofproto-dpif and dpif-netdev, so the values
written by metaflow don't make sense at all.

When megaflows are turned off, ofproto-dpif internally generates masks
using flow_wildcards_init_for_packet(). Since that's the same as what
we want to do here, we can just use that instead of metaflow.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2015-07-16 09:05:33 -07:00
+								                                          &match.flow, &match.wc);
-												datapath: Convert odp_flow_key to use Netlink attributes instead.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This
commit makes that change using Netlink attribute sequences.

This commit does not actually make userspace flexible enough to handle
changes in the kernel flow key structure, because userspace doesn't yet
have enough information to do that intelligently.  Upcoming commits will
fix that.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-23 18:44:44 -08:00
+								    if (error) {
 								        return error;
 								    }
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    pmd = dp_netdev_get_pmd(dp, pmd_id);
 								    if (!pmd) {
 								        return EINVAL;
 								    }
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    /* Must produce a netdev_flow_key for lookup.
 								     * This interface is no longer performance critical, since it is not used
 								     * for upcall processing any more. */
 								    netdev_flow_key_from_flow(&key, &match.flow);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								    if (put->ufid) {
 								        ufid = *put->ufid;
 								    } else {
 								        dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
 								    }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    ovs_mutex_lock(&pmd->flow_mutex);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &key, NULL);
-												dpif-netdev: Change a variable name.

'struct dp_netdev_flow' is currently being instantiated as 'flow'.
An upcoming commit introduces a classifier to dpif-netdev
which uses 'struct flow' at a few places and that can cause
confusion while reading code.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-10-29 02:34:15 -07:00
+								    if (!netdev_flow) {
-												dpif: Change provider interface to consistently use operation structs.

Until now, a "flow put" has represented its parameters in two different
ways, depending on whether it was coming from dpif_flow_put() or from
dpif_operate(), and similarly for an "execute" operation.  This commit
adopts the operation struct consistently within the dpif provider
interface, which seems cleaner.

This commit also factors out logging for flow puts and executes, which
is useful in the following commit.

This doesn't change the dpif client interface, since the two forms are
more convenient for clients than always filling out an operation struct.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2011-12-26 14:39:03 -08:00
+								        if (put->flags & DPIF_FP_CREATE) {
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								            if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
-												dpif: Change provider interface to consistently use operation structs.

Until now, a "flow put" has represented its parameters in two different
ways, depending on whether it was coming from dpif_flow_put() or from
dpif_operate(), and similarly for an "execute" operation.  This commit
adopts the operation struct consistently within the dpif provider
interface, which seems cleaner.

This commit also factors out logging for flow puts and executes, which
is useful in the following commit.

This doesn't change the dpif client interface, since the two forms are
more convenient for clients than always filling out an operation struct.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2011-12-26 14:39:03 -08:00
+								                if (put->stats) {
 								                    memset(put->stats, 0, sizeof *put->stats);
-												dpif: Eliminate "struct odp_flow" from client-visible interface.

Following this commit, "struct odp_flow" and related data structures are
only used in Linux-specific parts of OVS userspace code.  This allows the
actual Linux datapath interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-26 07:03:39 -08:00
+								                }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								                dp_netdev_flow_add(pmd, &match, &ufid, put->actions,
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								                                   put->actions_len);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								                error = 0;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								            } else {
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								                error = EFBIG;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								            }
 								        } else {
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								            error = ENOENT;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								        }
 								    } else {
-												dpif-netdev: Introduce a classifier in userspace datapath.

Instead of an exact match flow table, we introduce a classifier.
This enables mega-flows in userspace datapath.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
[blp@nicira.com tweaked flow lookup]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-04 06:23:54 -08:00
+								        if (put->flags & DPIF_FP_MODIFY
-												dpif-netdev: Avoid useless flow copy in dp_netdev_flow_add().

This patch gives dp_netdev_flow_add() a match with which it can
initialize the classifier rule.  This prevents it from needing to copy
a flow and flow_wildcards into the match first.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-27 18:56:45 -07:00
+								            && flow_equal(&match.flow, &netdev_flow->flow)) {
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								            struct dp_netdev_actions *new_actions;
 								            struct dp_netdev_actions *old_actions;
 								            new_actions = dp_netdev_actions_create(put->actions,
 								                                                   put->actions_len);
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								            old_actions = dp_netdev_flow_get_actions(netdev_flow);
 								            ovsrcu_set(&netdev_flow->actions, new_actions);
-												dpif-netdev: Use ovsthread_stats for flow stats.

This should scale better than a single mutex, though still not
ideally.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-01-22 16:03:10 -08:00
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								            if (put->stats) {
 								                get_dpif_flow_stats(netdev_flow, put->stats);
 								            }
 								            if (put->flags & DPIF_FP_ZERO_STATS) {
-												dpif-netdev: Remove support for DPIF_FP_ZERO_STATS flag

Since flow statistics are thread local and updated without any lock,
it is not correct to do a memset from another thread.

This commit simply removes the support for the flag.  It is not needed
by ofproto-dpif, it is only exposed by dpctl commands.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-01 17:20:57 +01:00
+								                /* XXX: The userspace datapath uses thread local statistics
 								                 * (for flows), which should be updated only by the owning
 								                 * thread.  Since we cannot write on stats memory here,
 								                 * we choose not to support this flag.  Please note:
 								                 * - This feature is currently used only by dpctl commands with
 								                 *   option --clear.
 								                 * - Should the need arise, this operation can be implemented
 								                 *   by keeping a base value (to be update here) for each
 								                 *   counter, and subtracting it before outputting the stats */
 								                error = EOPNOTSUPP;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								            }
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
-												dpif-netdev: Introduce a classifier in userspace datapath.

Instead of an exact match flow table, we introduce a classifier.
This enables mega-flows in userspace datapath.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
[blp@nicira.com tweaked flow lookup]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-04 06:23:54 -08:00
+								        } else if (put->flags & DPIF_FP_CREATE) {
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								            error = EEXIST;
-												dpif-netdev: Introduce a classifier in userspace datapath.

Instead of an exact match flow table, we introduce a classifier.
This enables mega-flows in userspace datapath.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
[blp@nicira.com tweaked flow lookup]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-04 06:23:54 -08:00
+								        } else {
 								            /* Overlapping flow. */
 								            error = EINVAL;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								        }
 								    }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    ovs_mutex_unlock(&pmd->flow_mutex);
 								    dp_netdev_pmd_unref(pmd);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
 								    return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static int
-												ofproto-dpif: Batch flow uninstallations due to expiration.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-17 21:52:10 -07:00
+								dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Change a variable name.

'struct dp_netdev_flow' is currently being instantiated as 'flow'.
An upcoming commit introduces a classifier to dpif-netdev
which uses 'struct flow' at a few places and that can cause
confusion while reading code.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-10-29 02:34:15 -07:00
+								    struct dp_netdev_flow *netdev_flow;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    struct dp_netdev_pmd_thread *pmd;
-												ovs-numa: Change 'core_id' to unsigned.

DPDK lcore_id is unsigned.  We need to support big values like
LCORE_ID_ANY (=UINT32_MAX).  Therefore I am changing the type everywhere
in OVS.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:19 +01:00
+								    unsigned pmd_id = del->pmd_id == PMD_ID_NULL
 								                      ? NON_PMD_CORE_ID : del->pmd_id;
-												dpif: Index flows using unique identifiers.

This patch modifies the dpif interface to allow flows to be manipulated
using a 128-bit identifier. This allows revalidator threads to perform
datapath operations faster, as they do not need to serialise the entire
flow key for operations like flow_get and flow_delete. In conjunction
with a future patch to simplify the dump interface, this provides a
significant performance benefit for revalidation.

When handlers assemble flow_put operations, they specify a unique
identifier (UFID) for each flow as it is passed down to the datapath to
be stored with the flow. The UFID is currently provided to handlers
by the dpif during upcall processing.

When revalidators assemble flow_get or flow_del operations, they may
specify the UFID for the flow along with the key. The dpif will decide
whether to send only the UFID to the datapath, or both the UFID and flow
key. The former is preferred for newer datapaths that support UFID,
while the latter is used for backwards compatibility.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 16:26:35 +12:00
+								    int error = 0;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    pmd = dp_netdev_get_pmd(dp, pmd_id);
 								    if (!pmd) {
 								        return EINVAL;
 								    }
 								    ovs_mutex_lock(&pmd->flow_mutex);
 								    netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
 								                                          del->key_len);
-												dpif-netdev: Change a variable name.

'struct dp_netdev_flow' is currently being instantiated as 'flow'.
An upcoming commit introduces a classifier to dpif-netdev
which uses 'struct flow' at a few places and that can cause
confusion while reading code.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-10-29 02:34:15 -07:00
+								    if (netdev_flow) {
-												ofproto-dpif: Batch flow uninstallations due to expiration.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-04-17 21:52:10 -07:00
+								        if (del->stats) {
-												dpif-netdev: Change a variable name.

'struct dp_netdev_flow' is currently being instantiated as 'flow'.
An upcoming commit introduces a classifier to dpif-netdev
which uses 'struct flow' at a few places and that can cause
confusion while reading code.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-10-29 02:34:15 -07:00
+								            get_dpif_flow_stats(netdev_flow, del->stats);
-												dpif: Eliminate "struct odp_flow" from client-visible interface.

Following this commit, "struct odp_flow" and related data structures are
only used in Linux-specific parts of OVS userspace code.  This allows the
actual Linux datapath interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-26 07:03:39 -08:00
+								        }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    } else {
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
+								        error = ENOENT;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    ovs_mutex_unlock(&pmd->flow_mutex);
 								    dp_netdev_pmd_unref(pmd);
-												dpif-netdev: Make internally thread-safe by introducing a global mutex.

This can be improved later but it is the simple thing to do for now.

I marked a couple of races with XXX.  I don't have a really good solution
for these, but I hope to find one.  They may be harmless in practice.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-07-23 16:56:26 -07:00
 								    return error;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								struct dpif_netdev_flow_dump {
 								    struct dpif_flow_dump up;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    struct cmap_position poll_thread_pos;
 								    struct cmap_position flow_pos;
 								    struct dp_netdev_pmd_thread *cur_pmd;
-												dpif: Make dpif_flow_dump_next() thread-safe.

This patch makes it the caller's responsibility to initialize a
per-thread 'state' object and pass it down to the dpif_flow_dump_next()
implementation. The implementation can expect to be called from multiple
threads with the same 'iter' and different 'state' objects.

When flow_dump_next() returns non-zero, the implementation must ensure
that subsequent calls with the same arguments also return non-zero.
Subsequent calls with the same 'iter' and different 'state' may return
zero, but should make progress towards returning non-zero.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:08 -08:00
+								    int status;
 								    struct ovs_mutex mutex;
-												dpif: Separate local and shared flow dump state.

This patch separates the structures for thread-local flow dump state
("state") from the shared flow dump state ("iter") in dpif-linux and
dpif-netdev. Future patches will make use of this to allow multiple
threads to dump flows from the same flow dump operation.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:07 -08:00
+								};
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								static struct dpif_netdev_flow_dump *
 								dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								    return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
-												dpif: Separate local and shared flow dump state.

This patch separates the structures for thread-local flow dump state
("state") from the shared flow dump state ("iter") in dpif-linux and
dpif-netdev. Future patches will make use of this to allow multiple
threads to dump flows from the same flow dump operation.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:07 -08:00
+								}
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								static struct dpif_flow_dump *
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
+								dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse)
-												dpif: Separate local and shared flow dump state.

This patch separates the structures for thread-local flow dump state
("state") from the shared flow dump state ("iter") in dpif-linux and
dpif-netdev. Future patches will make use of this to allow multiple
threads to dump flows from the same flow dump operation.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:07 -08:00
+								{
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								    struct dpif_netdev_flow_dump *dump;
-												dpif: Separate local and shared flow dump state.

This patch separates the structures for thread-local flow dump state
("state") from the shared flow dump state ("iter") in dpif-linux and
dpif-netdev. Future patches will make use of this to allow multiple
threads to dump flows from the same flow dump operation.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:07 -08:00
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    dump = xzalloc(sizeof *dump);
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								    dpif_flow_dump_init(&dump->up, dpif_);
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
+								    dump->up.terse = terse;
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								    ovs_mutex_init(&dump->mutex);
 								    return &dump->up;
-												dpif: Separate local and shared flow dump state.

This patch separates the structures for thread-local flow dump state
("state") from the shared flow dump state ("iter") in dpif-linux and
dpif-netdev. Future patches will make use of this to allow multiple
threads to dump flows from the same flow dump operation.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:07 -08:00
+								}
 								static int
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
-												dpif: Separate local and shared flow dump state.

This patch separates the structures for thread-local flow dump state
("state") from the shared flow dump state ("iter") in dpif-linux and
dpif-netdev. Future patches will make use of this to allow multiple
threads to dump flows from the same flow dump operation.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:07 -08:00
+								{
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
-												dpif: Separate local and shared flow dump state.

This patch separates the structures for thread-local flow dump state
("state") from the shared flow dump state ("iter") in dpif-linux and
dpif-netdev. Future patches will make use of this to allow multiple
threads to dump flows from the same flow dump operation.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:07 -08:00
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								    ovs_mutex_destroy(&dump->mutex);
 								    free(dump);
-												datapath: Change listing flows to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This does
not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form,
because that would require userspace to know how much space to allocate
for each flow's key in advance, or to allocate as much space as could
possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_FLOW_LIST
by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath
on each call.  It is much cleaner to allocate the maximum amount of space
for a single flow key than to do so for possibly a very large number of
flow keys.

As a side effect, this patch also fixes a race condition that sometimes
made "ovs-dpctl dump-flows" print an error: previously, flows were listed
and then their actions were retrieved, which left a window in which
ovs-vswitchd could delete the flow.  Now dumping a flow and its actions is
a single step, closing that window.

Dumping all of the flows in a datapath is no longer an atomic step, so now
it is possible to miss some flows or see a single flow twice during
iteration, if the flow table is modified by another process.  It doesn't
look like this should be a problem for ovs-vswitchd.

It would be faster to retrieve a number of flows in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-28 10:39:52 -08:00
+								    return 0;
 								}
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								struct dpif_netdev_flow_dump_thread {
 								    struct dpif_flow_dump_thread up;
 								    struct dpif_netdev_flow_dump *dump;
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
+								    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
 								    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								};
 								static struct dpif_netdev_flow_dump_thread *
 								dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
 								{
 								    return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
 								}
 								static struct dpif_flow_dump_thread *
 								dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
 								{
 								    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
 								    struct dpif_netdev_flow_dump_thread *thread;
 								    thread = xmalloc(sizeof *thread);
 								    dpif_flow_dump_thread_init(&thread->up, &dump->up);
 								    thread->dump = dump;
 								    return &thread->up;
 								}
 								static void
 								dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
 								{
 								    struct dpif_netdev_flow_dump_thread *thread
 								        = dpif_netdev_flow_dump_thread_cast(thread_);
 								    free(thread);
 								}
-												datapath: Change listing flows to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This does
not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form,
because that would require userspace to know how much space to allocate
for each flow's key in advance, or to allocate as much space as could
possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_FLOW_LIST
by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath
on each call.  It is much cleaner to allocate the maximum amount of space
for a single flow key than to do so for possibly a very large number of
flow keys.

As a side effect, this patch also fixes a race condition that sometimes
made "ovs-dpctl dump-flows" print an error: previously, flows were listed
and then their actions were retrieved, which left a window in which
ovs-vswitchd could delete the flow.  Now dumping a flow and its actions is
a single step, closing that window.

Dumping all of the flows in a datapath is no longer an atomic step, so now
it is possible to miss some flows or see a single flow twice during
iteration, if the flow table is modified by another process.  It doesn't
look like this should be a problem for ovs-vswitchd.

It would be faster to retrieve a number of flows in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-28 10:39:52 -08:00
+								static int
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
+								                           struct dpif_flow *flows, int max_flows)
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								{
 								    struct dpif_netdev_flow_dump_thread *thread
 								        = dpif_netdev_flow_dump_thread_cast(thread_);
 								    struct dpif_netdev_flow_dump *dump = thread->dump;
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
+								    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
 								    int n_flows = 0;
 								    int i;
-												flow: Separate "flow_t" from "struct odp_flow_key".

The "struct odp_flow_key" used in the kernel datapath is conceptually
separate from the "flow_t" used in userspace, but until now we have
used the latter as a typedef for the former for convenience.  This commit
separates them.  This makes it possible in upcoming commits to change
them independently.

This is cross-ported from the "wdp" branch, which has had it for months.

											
										
										
											2010-10-11 13:31:35 -07:00
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								    ovs_mutex_lock(&dump->mutex);
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
+								    if (!dump->status) {
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								        struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
 								        struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
 								        struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
 								        int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
 								        /* First call to dump_next(), extracts the first pmd thread.
 								         * If there is no pmd thread, returns immediately. */
 								        if (!pmd) {
 								            pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
 								            if (!pmd) {
 								                ovs_mutex_unlock(&dump->mutex);
 								                return n_flows;
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
 								            }
-												dpif: Make dpif_flow_dump_next() thread-safe.

This patch makes it the caller's responsibility to initialize a
per-thread 'state' object and pass it down to the dpif_flow_dump_next()
implementation. The implementation can expect to be called from multiple
threads with the same 'iter' and different 'state' objects.

When flow_dump_next() returns non-zero, the implementation must ensure
that subsequent calls with the same arguments also return non-zero.
Subsequent calls with the same 'iter' and different 'state' may return
zero, but should make progress towards returning non-zero.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-02-27 14:13:08 -08:00
+								        }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
 								        do {
 								            for (n_flows = 0; n_flows < flow_limit; n_flows++) {
 								                struct cmap_node *node;
 								                node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
 								                if (!node) {
 								                    break;
 								                }
 								                netdev_flows[n_flows] = CONTAINER_OF(node,
 								                                                     struct dp_netdev_flow,
 								                                                     node);
 								            }
 								            /* When finishing dumping the current pmd thread, moves to
 								             * the next. */
 								            if (n_flows < flow_limit) {
 								                memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
 								                dp_netdev_pmd_unref(pmd);
 								                pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
 								                if (!pmd) {
 								                    dump->status = EOF;
 								                    break;
 								                }
 								            }
 								            /* Keeps the reference to next caller. */
 								            dump->cur_pmd = pmd;
 								            /* If the current dump is empty, do not exit the loop, since the
 								             * remaining pmds could have flows to be dumped.  Just dumps again
 								             * on the new 'pmd'. */
 								        } while (!n_flows);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    }
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								    ovs_mutex_unlock(&dump->mutex);
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
+								    for (i = 0; i < n_flows; i++) {
 								        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
 								        struct odputil_keybuf *keybuf = &thread->keybuf[i];
 								        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
 								        struct dpif_flow *f = &flows[i];
-												dpif: Generate flow_hash for revalidators in dpif.

This patch shifts the responsibility for determining the hash for a flow
from the revalidation logic down to the dpif layer. This assists in
handling backward-compatibility for revalidation with the upcoming
unique flow identifier "UFID" patches.

A 128-bit UFID was selected to minimize the likelihood of hash conflicts.
Handler threads will not install a flow that has an identical UFID as
another flow, to prevent misattribution of stats and to ensure that the
correct flow key cache is used for revalidation.

For datapaths that do not support UFID, which is currently all
datapaths, the dpif will generate the UFID and pass it up during upcall
and flow_dump. This is generated based on the datapath flow key.

Later patches will add support for datapaths to store and interpret this
UFID, in which case the dpif has a responsibility to pass it through
transparently.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 15:24:39 +12:00
+								        struct ofpbuf key, mask;
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
-												dpif: Generate flow_hash for revalidators in dpif.

This patch shifts the responsibility for determining the hash for a flow
from the revalidation logic down to the dpif layer. This assists in
handling backward-compatibility for revalidation with the upcoming
unique flow identifier "UFID" patches.

A 128-bit UFID was selected to minimize the likelihood of hash conflicts.
Handler threads will not install a flow that has an identical UFID as
another flow, to prevent misattribution of stats and to ensure that the
correct flow key cache is used for revalidation.

For datapaths that do not support UFID, which is currently all
datapaths, the dpif will generate the UFID and pass it up during upcall
and flow_dump. This is generated based on the datapath flow key.

Later patches will add support for datapaths to store and interpret this
UFID, in which case the dpif has a responsibility to pass it through
transparently.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 15:24:39 +12:00
+								        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
 								        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
-												dpif: Minimize memory copy for revalidation.

One of the limiting factors on the number of flows that can be supported
in the datapath is the overhead of assembling flow dump messages in the
datapath. This patch modifies the dpif to allow revalidators to skip
dumping the key, mask and actions from the datapath, by making use of
the unique flow identifiers introduced in earlier patches.

For each flow dump, the dpif user specifies whether to skip these
attributes, allowing the common case to only dump a pair of 128-bit ID
and flow stats. With datapath support, this increases the number of
flows that a revalidator can handle per second by 50% or more. Support
in dpif-netdev and dpif-netlink is added in this patch; kernel support
is left for future patches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-06 11:14:08 +13:00
+								        dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
 								                                    dump->up.terse);
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
+								    }
-												dpif: Eliminate "struct odp_flow" from client-visible interface.

Following this commit, "struct odp_flow" and related data structures are
only used in Linux-specific parts of OVS userspace code.  This allows the
actual Linux datapath interface to evolve more freely.

Reviewed by Justin Pettit.

											
										
										
											2011-01-26 07:03:39 -08:00
-												dpif-netdev: Implement batched flow dumping.

Previously, flows were retrieved one by one when dumping flows for
datapaths of type 'netdev'. This increased contention for the dump's
mutex, negatively affecting revalidator performance.

This patch retrieves batches of flows when dumping flows for datapaths
of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
[blp@nicira.com relaxed max_flows restriction]
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-06-23 12:36:11 -07:00
+								    return n_flows;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
 								static int
-												dpif: Use explicit packet metadata.

This helps reduce confusion about when a flow is a flow and when it is
just metadata.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    OVS_NO_THREAD_SAFETY_ANALYSIS
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    struct dp_packet_batch pp;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dp-packet: Remove ofpbuf dependency.

Currently dp-packet make use of ofpbuf for managing packet
buffers. That complicates ofpbuf, by making dp-packet
independent of ofpbuf both libraries can be optimized for
their own use case.
This avoids mapping operation between ofpbuf and dp_packet
in datapath upcalls.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-02-22 03:21:09 -08:00
+								    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
 								        dp_packet_size(execute->packet) > UINT16_MAX) {
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								        return EINVAL;
 								    }
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    /* Tries finding the 'pmd'.  If NULL is returned, that means
 								     * the current thread is a non-pmd thread and should use
-												dpif-netdev: Add function to get pmd using core id.

This commit adds the function dp_netdev_get_pmd() which allows
users to get 'struct dp_netdev_pmd_thread' based on the core id.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-13 14:47:09 -07:00
+								     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    pmd = ovsthread_getspecific(dp->per_pmd_key);
 								    if (!pmd) {
-												dpif-netdev: Add function to get pmd using core id.

This commit adds the function dp_netdev_get_pmd() which allows
users to get 'struct dp_netdev_pmd_thread' based on the core id.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-13 14:47:09 -07:00
+								        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
-												dpif-netdev: Fix crash in dpif_netdev_execute().

dp_netdev_get_pmd() is allowed to return NULL (even if we call it with
NON_PMD_CORE_ID) for different reasons:

* Since we use RCU to protect pmd threads, it is possible that
  ovs_refcount_try_ref_rcu() has failed.
* During reconfiguration we destroy every thread.

This commit makes sure that we always handle the case when
dp_netdev_get_pmd() returns NULL without crashing (the change in
dpif_netdev_run() doesn't fix anything, because everything is happening
in the main thread, but it's better to honor the interface in case we
change our threading model).

This actually fixes a pretty serious crash that happens if
dpif_netdev_execute() is called from a non pmd thread while
reconfiguration is happening.  It can be triggered by enabling bfd
(because it's handled by the monitor thread, which is a non pmd thread)
on an interface and changing something that requires datapath
reconfiguration (n_rxq, pmd-cpu-mask, mtu).

A testcase that reproduces the race condition is included.

This is a possible backtrace of the segfault:

 #0  0x000000000060c7f1 in dp_execute_cb (aux_=0x7f1dd2d2a320,
 packets_=0x7f1dd2d2a370, a=0x7f1dd2d2a658, may_steal=false) at
 ../lib/dpif-netdev.c:4357
 #1  0x00000000006448b2 in odp_execute_actions (dp=0x7f1dd2d2a320,
 batch=0x7f1dd2d2a370, steal=false, actions=0x7f1dd2d2a658,
 actions_len=8,
     dp_execute_action=0x60c7a5 <dp_execute_cb>) at
 ../lib/odp-execute.c:538
 #2  0x000000000060d00c in dp_netdev_execute_actions (pmd=0x0,
 packets=0x7f1dd2d2a370, may_steal=false, flow=0x7f1dd2d2ae70,
 actions=0x7f1dd2d2a658, actions_len=8,
     now=44965873) at ../lib/dpif-netdev.c:4577
 #3  0x000000000060834a in dpif_netdev_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2a578) at ../lib/dpif-netdev.c:2624
 #4  0x0000000000608441 in dpif_netdev_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif-netdev.c:2654
 #5  0x0000000000610a30 in dpif_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif.c:1268
 #6  0x000000000061098c in dpif_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2aa50) at ../lib/dpif.c:1233
 #7  0x00000000005b9008 in ofproto_dpif_execute_actions__
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, indentation=0, depth=0, resubmits=0,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:3806
 #8  0x00000000005b907a in ofproto_dpif_execute_actions
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif.c:3823
 #9  0x00000000005dea9b in xlate_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif-xlate.c:5792
 #10 0x00000000005bab12 in ofproto_dpif_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:4628
 #11 0x00000000005c3fc8 in monitor_mport_run (mport=0x2b8cd00,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif-monitor.c:287
 #12 0x00000000005c3d9b in monitor_run () at
 ../ofproto/ofproto-dpif-monitor.c:227
 #13 0x00000000005c3cab in monitor_main (args=0x0) at
 ../ofproto/ofproto-dpif-monitor.c:189
 #14 0x00000000006a183a in ovsthread_wrapper (aux_=0x2b8afd0) at
 ../lib/ovs-thread.c:342
 #15 0x00007f1dd75eb444 in start_thread (arg=0x7f1dd2d2c700) at
 pthread_create.c:333
 #16 0x00007f1dd6e1d20d in clone () at
 ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-10-04 14:53:31 -07:00
+								        if (!pmd) {
 								            return EBUSY;
 								        }
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    }
 								    /* If the current thread is non-pmd thread, acquires
 								     * the 'non_pmd_mutex'. */
 								    if (pmd->core_id == NON_PMD_CORE_ID) {
 								        ovs_mutex_lock(&dp->non_pmd_mutex);
 								    }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
-												dpif-netdev: Initialize packet RSS hash in dpif_netdev_execute().

The datapath code expects the RSS hash to always be initialized.  This
is enforced by checking in emc_processing() that the hash is valid, and
eventually by computing a new one.

Unfortunately, there is another entry point to the datapath,
dpif_netdev_execute().  A packet generated by OVS (BFD frame,
packet-out from controller) doesn't have a valid RSS hash and so is
allowed to enter the datapath with an uninitialized hash value.

This commit recomputes the hash (if not valid) in dpif_netdev_execute().

The only place where we would use an invalid hash is netdev-vport, in
push_udp_header().  This caused an uninitialized memory read, and a
random value to be assigned to the outer tunnel header source port.

Reported-by: William Tu <u9012063@gmail.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: William Tu <u9012063@gmail.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-05-17 18:38:20 -07:00
+								    /* The action processing expects the RSS hash to be valid, because
 								     * it's always initialized at the beginning of datapath processing.
 								     * In this case, though, 'execute->packet' may not have gone through
 								     * the datapath at all, it may have been generated by the upper layer
 								     * (OpenFlow packet-out, BFD frame, ...). */
 								    if (!dp_packet_rss_valid(execute->packet)) {
 								        dp_packet_set_rss_hash(execute->packet,
 								                               flow_hash_5tuple(execute->flow, 0));
 								    }
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    packet_batch_init_packet(&pp, execute->packet);
-												conntrack: Add 'dl_type' parameter to conntrack_execute().

Now that dpif_execute has a 'flow' member, it's pretty easy to access a
the flow (or the matching megaflow) in dp_execute_cb().

This means that's not necessary anymore for the connection tracker to
reextract 'dl_type' from the packet, it can be passed as a parameter.

This change means that we have to complicate sightly test-conntrack to
group the packets by dl_type before passing them to the connection
tracker.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-25 18:10:09 -07:00
+								    dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
 								                              execute->actions, execute->actions_len,
 								                              time_msec());
-												dpif-netdev: Initialize packet RSS hash in dpif_netdev_execute().

The datapath code expects the RSS hash to always be initialized.  This
is enforced by checking in emc_processing() that the hash is valid, and
eventually by computing a new one.

Unfortunately, there is another entry point to the datapath,
dpif_netdev_execute().  A packet generated by OVS (BFD frame,
packet-out from controller) doesn't have a valid RSS hash and so is
allowed to enter the datapath with an uninitialized hash value.

This commit recomputes the hash (if not valid) in dpif_netdev_execute().

The only place where we would use an invalid hash is netdev-vport, in
push_udp_header().  This caused an uninitialized memory read, and a
random value to be assigned to the outer tunnel header source port.

Reported-by: William Tu <u9012063@gmail.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: William Tu <u9012063@gmail.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-05-17 18:38:20 -07:00
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    if (pmd->core_id == NON_PMD_CORE_ID) {
 								        ovs_mutex_unlock(&dp->non_pmd_mutex);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								        dp_netdev_pmd_unref(pmd);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    }
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
-												dpif: Use explicit packet metadata.

This helps reduce confusion about when a flow is a flow and when it is
just metadata.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								    return 0;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												dpif-provider: Get rid of redundant operations.

The dpif provider 'operate' call duplicates all of the features available
from the 'flow_put', 'flow_del', and 'execute' calls, yielding redundant
code in providers that support both mechanisms.  This change drops the
latter calls in favor of making every dpif provider support 'operate'.
The result is code that is overall less duplicative.

It might make sense to do the same with flow_get but so far 'operate'
doesn't support flow_get.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-15 16:09:40 -07:00
+								static void
 								dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
 								{
 								    size_t i;
 								    for (i = 0; i < n_ops; i++) {
 								        struct dpif_op *op = ops[i];
 								        switch (op->type) {
 								        case DPIF_OP_FLOW_PUT:
 								            op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
 								            break;
 								        case DPIF_OP_FLOW_DEL:
 								            op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
 								            break;
 								        case DPIF_OP_EXECUTE:
 								            op->error = dpif_netdev_execute(dpif, &op->u.execute);
 								            break;
-												dpif: Support flow_get in dpif_operate().

This cleans up the dpif interface to make it more consistent with the
other dpif operations, and allows flows to be fetched in batches.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-13 09:55:54 +12:00
 								        case DPIF_OP_FLOW_GET:
 								            op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
 								            break;
-												dpif-provider: Get rid of redundant operations.

The dpif provider 'operate' call duplicates all of the features available
from the 'flow_put', 'flow_del', and 'execute' calls, yielding redundant
code in providers that support both mechanisms.  This change drops the
latter calls in favor of making every dpif provider support 'operate'.
The result is code that is overall less duplicative.

It might make sense to do the same with flow_get but so far 'operate'
doesn't support flow_get.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-15 16:09:40 -07:00
+								        }
 								    }
 								}
-												dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-23 15:33:43 -08:00
+								/* Changes the number or the affinity of pmd threads.  The changes are actually
 								 * applied in dpif_netdev_run(). */
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								static int
-												dpif-netdev: Allow different numbers of rx queues for different ports.

Currently, all of the PMD netdevs can only have the same number of
rx queues, which is specified in other_config:n-dpdk-rxqs.

Fix that by introducing of new option for PMD interfaces: 'n_rxq', which
specifies the maximum number of rx queues to be created for this
interface.

Example:
	ovs-vsctl set Interface dpdk0 options:n_rxq=8

Old 'other_config:n-dpdk-rxqs' deleted.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ben Pfaff <blp@ovn.org>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-21 17:15:18 +03:00
+								dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Add reconfiguration request to dp_netdev.

Next patches will add new conditions when reconfiguration will be
required. It'll be simpler to have common way to request reconfiguration.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:43 +03:00
+								    if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
 								        free(dp->pmd_cmask);
 								        dp->pmd_cmask = nullable_xstrdup(cmask);
 								        dp_netdev_request_reconfigure(dp);
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								    }
 								    return 0;
 								}
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								/* Parses affinity list and returns result in 'core_ids'. */
 								static int
 								parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
 								{
 								    unsigned i;
 								    char *list, *copy, *key, *value;
 								    int error = 0;
 								    for (i = 0; i < n_rxq; i++) {
-												dpif-netdev: Uses the OVS_CORE_UNSPEC instead of magic numbers.

This patch uses OVS_CORE_UNSPEC for the queue unpinned instead
of "-1". More important, the "-1" casted to unsigned int is
equal to NON_PMD_CORE_ID. We make the distinction between them.

Signed-off-by: nickcooper-zhangtonghao <nic@opencloud.tech>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2017-01-08 17:30:22 -08:00
+								        core_ids[i] = OVS_CORE_UNSPEC;
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    }
 								    if (!affinity_list) {
 								        return 0;
 								    }
 								    list = copy = xstrdup(affinity_list);
 								    while (ofputil_parse_key_value(&list, &key, &value)) {
 								        int rxq_id, core_id;
 								        if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
 								            || !str_to_int(value, 0, &core_id) || core_id < 0) {
 								            error = EINVAL;
 								            break;
 								        }
 								        if (rxq_id < n_rxq) {
 								            core_ids[rxq_id] = core_id;
 								        }
 								    }
 								    free(copy);
 								    return error;
 								}
 								/* Parses 'affinity_list' and applies configuration if it is valid. */
 								static int
 								dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
 								                                  const char *affinity_list)
 								{
 								    unsigned *core_ids, i;
 								    int error = 0;
 								    core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
 								    if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
 								        error = EINVAL;
 								        goto exit;
 								    }
 								    for (i = 0; i < port->n_rxq; i++) {
 								        port->rxqs[i].core_id = core_ids[i];
 								    }
 								exit:
 								    free(core_ids);
 								    return error;
 								}
 								/* Changes the affinity of port's rx queues.  The changes are actually applied
 								 * in dpif_netdev_run(). */
 								static int
 								dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
 								                            const struct smap *cfg)
 								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
 								    struct dp_netdev_port *port;
 								    int error = 0;
 								    const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
 								    ovs_mutex_lock(&dp->port_mutex);
 								    error = get_port_by_number(dp, port_no, &port);
 								    if (error || !netdev_is_pmd(port->netdev)
 								        || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
 								        goto unlock;
 								    }
 								    error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
 								    if (error) {
 								        goto unlock;
 								    }
 								    free(port->rxq_affinity_list);
 								    port->rxq_affinity_list = nullable_xstrdup(affinity_list);
 								    dp_netdev_request_reconfigure(dp);
 								unlock:
 								    ovs_mutex_unlock(&dp->port_mutex);
 								    return error;
 								}
-												dpif-netdev: Allow enqueue actions.

The dpif-netdev implementation disallowed enqueue actions because
it did not support conversion from OVS 'queue_id' to dpif
'priority'.  For testing purposes, this patch allows queues which
translate into NOOPs.

											
										
										
											2011-11-21 13:36:17 -08:00
+								static int
 								dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
 								                              uint32_t queue_id, uint32_t *priority)
 								{
 								    *priority = queue_id;
 								    return 0;
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Store actions data and size contiguously.

As stated by the comment above the structure, the 'action' pointer does not
change during the 'dp_netdev_actions' lifetime: we might as well embed
the pointed memory into the structure.

The commit also updates the description of dp_netdev_actions_create().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:46 +01:00
+								/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
 								 * a copy of the 'ofpacts_len' bytes of 'ofpacts'. */
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								struct dp_netdev_actions *
 								dp_netdev_actions_create(const struct nlattr *actions, size_t size)
 								{
 								    struct dp_netdev_actions *netdev_actions;
-												dpif-netdev: Store actions data and size contiguously.

As stated by the comment above the structure, the 'action' pointer does not
change during the 'dp_netdev_actions' lifetime: we might as well embed
the pointed memory into the structure.

The commit also updates the description of dp_netdev_actions_create().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:46 +01:00
+								    netdev_actions = xmalloc(sizeof *netdev_actions + size);
 								    memcpy(netdev_actions->actions, actions, size);
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								    netdev_actions->size = size;
 								    return netdev_actions;
 								}
 								struct dp_netdev_actions *
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								{
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								    return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								}
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								static void
 								dp_netdev_actions_free(struct dp_netdev_actions *actions)
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								{
-												dpif-netdev: Use RCU to protect data.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-05 22:41:30 -08:00
+								    free(actions);
-												dpif-netdev: Break actions out into new struct dp_netdev_actions.

This is analogous to the split between rule and rule_actions in
ofproto.  As there, it will allow retaining a reference to a rule's
actions, while processing them, without having to retain a reference
to the rule itself.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 14:37:13 -08:00
+								}
-												dpif-netdev: Add simple per pmd-thread cycles counters.

The counters use x86 TSC if available (currently only with DPDK). They
will be exposed by subsequents commits

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:48 +01:00
+								static inline unsigned long long
 								cycles_counter(void)
 								{
 								#ifdef DPDK_NETDEV
 								    return rte_get_tsc_cycles();
 								#else
 								    return 0;
 								#endif
 								}
 								/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
 								extern struct ovs_mutex cycles_counter_fake_mutex;
 								/* Start counting cycles.  Must be followed by 'cycles_count_end()' */
 								static inline void
 								cycles_count_start(struct dp_netdev_pmd_thread *pmd)
 								    OVS_ACQUIRES(&cycles_counter_fake_mutex)
 								    OVS_NO_THREAD_SAFETY_ANALYSIS
 								{
 								    pmd->last_cycles = cycles_counter();
 								}
 								/* Stop counting cycles and add them to the counter 'type' */
 								static inline void
 								cycles_count_end(struct dp_netdev_pmd_thread *pmd,
 								                 enum pmd_cycles_counter_type type)
 								    OVS_RELEASES(&cycles_counter_fake_mutex)
 								    OVS_NO_THREAD_SAFETY_ANALYSIS
 								{
 								    unsigned long long interval = cycles_counter() - pmd->last_cycles;
 								    non_atomic_ullong_add(&pmd->cycles.n[type], interval);
 								}
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
-												sparse: workaround for a bug in sparse.

sparse emits the following warning:
lib/dpif-netdev.c:1755:15: warning: Initializer entry defined twice
lib/dpif-netdev.c:1755:15:   also defined here
due to a bug in sparse which doesn't like inlined functions which
expands a #define within it. This commit removes inline to make
sparse happy.

Signed-off-by: Pritesh Kothari <pritesh.kothari@cisco.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-28 12:20:00 -07:00
+								static void
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								                           struct dp_netdev_port *port,
 								                           struct netdev_rxq *rxq)
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								{
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    struct dp_packet_batch batch;
 								    int error;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    dp_packet_batch_init(&batch);
-												dpif-netdev: Add simple per pmd-thread cycles counters.

The counters use x86 TSC if available (currently only with DPDK). They
will be exposed by subsequents commits

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:48 +01:00
+								    cycles_count_start(pmd);
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    error = netdev_rxq_recv(rxq, &batch);
-												dpif-netdev: Add simple per pmd-thread cycles counters.

The counters use x86 TSC if available (currently only with DPDK). They
will be exposed by subsequents commits

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:48 +01:00
+								    cycles_count_end(pmd, PMD_CYCLES_POLLING);
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    if (!error) {
-												lib/dpif-netdev: Make emc_mutex recursive.

dpif_netdev_execute may be called while doing upcall processing.
Since the context of the input port is not tracked upto this point, we
use the shared dp->emc_cache for packet execution, where the emc_cache
is needed for recirculation.

While recursive mutexes can make thread safety analysis hard, for now
we change emc_mutex to be recursive.  Forthcoming new unit tests will
fail with the current non-recursive mutex.  Later improvements may
remove the need for this recursion.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>
											
										
										
											2014-09-08 15:33:00 -07:00
+								        *recirc_depth_get() = 0;
-												netdev-dpif: Add metadata to dpif-packet.

Today dpif-netdev has single metadat for given batch, since one
batch belongs to one port, but soon packets fro single tunnel ports
can belong to different ports, so we need to have per packet metadata.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-10-03 20:23:58 -07:00
-												dpif-netdev: Add simple per pmd-thread cycles counters.

The counters use x86 TSC if available (currently only with DPDK). They
will be exposed by subsequents commits

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:48 +01:00
+								        cycles_count_start(pmd);
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								        dp_netdev_input(pmd, &batch, port->port_no);
-												dpif-netdev: Add simple per pmd-thread cycles counters.

The counters use x86 TSC if available (currently only with DPDK). They
will be exposed by subsequents commits

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:48 +01:00
+								        cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    } else if (error != EAGAIN && error != EOPNOTSUPP) {
-												lib/dpif-netdev: Make emc_mutex recursive.

dpif_netdev_execute may be called while doing upcall processing.
Since the context of the input port is not tracked upto this point, we
use the shared dp->emc_cache for packet execution, where the emc_cache
is needed for recirculation.

While recursive mutexes can make thread safety analysis hard, for now
we change emc_mutex to be recursive.  Forthcoming new unit tests will
fail with the current non-recursive mutex.  Later improvements may
remove the need for this recursion.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>
											
										
										
											2014-09-08 15:33:00 -07:00
+								        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
 								        VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
-												lib/dpif-netdev: Make emc_mutex recursive.

dpif_netdev_execute may be called while doing upcall processing.
Since the context of the input port is not tracked upto this point, we
use the shared dp->emc_cache for packet execution, where the emc_cache
is needed for recirculation.

While recursive mutexes can make thread safety analysis hard, for now
we change emc_mutex to be recursive.  Forthcoming new unit tests will
fail with the current non-recursive mutex.  Later improvements may
remove the need for this recursion.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>
											
										
										
											2014-09-08 15:33:00 -07:00
+								                    netdev_get_name(port->netdev), ovs_strerror(error));
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    }
 								}
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								static int
 								port_reconfigure(struct dp_netdev_port *port)
 								{
 								    struct netdev *netdev = port->netdev;
 								    int i, err;
-												netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduce reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-26 15:58:24 -08:00
+								    if (!netdev_is_reconf_required(netdev)) {
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								        return 0;
 								    }
 								    /* Closes the existing 'rxq's. */
 								    for (i = 0; i < port->n_rxq; i++) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        netdev_rxq_close(port->rxqs[i].rxq);
 								        port->rxqs[i].rxq = NULL;
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								    }
 								    port->n_rxq = 0;
-												netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduce reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-26 15:58:24 -08:00
+								    /* Allows 'netdev' to apply the pending configuration changes. */
 								    err = netdev_reconfigure(netdev);
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								    if (err && (err != EOPNOTSUPP)) {
-												netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduce reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-26 15:58:24 -08:00
+								        VLOG_ERR("Failed to set interface %s new configuration",
 								                 netdev_get_name(netdev));
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								        return err;
 								    }
-												netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduce reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-26 15:58:24 -08:00
+								    /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    port->rxqs = xrealloc(port->rxqs,
 								                          sizeof *port->rxqs * netdev_n_rxq(netdev));
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    /* Realloc 'used' counters for tx queues. */
 								    free(port->txq_used);
 								    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								    for (i = 0; i < netdev_n_rxq(netdev); i++) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        err = netdev_rxq_open(netdev, &port->rxqs[i].rxq, i);
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								        if (err) {
 								            return err;
 								        }
 								        port->n_rxq++;
 								    }
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    /* Parse affinity list to apply configuration for new queues. */
 								    dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								    return 0;
 								}
-												dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-23 15:33:43 -08:00
+								static void
 								reconfigure_pmd_threads(struct dp_netdev *dp)
 								    OVS_REQUIRES(dp->port_mutex)
 								{
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								    struct dp_netdev_port *port, *next;
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    int n_cores;
-												dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-23 15:33:43 -08:00
-												dpif-netdev: Add reconfiguration request to dp_netdev.

Next patches will add new conditions when reconfiguration will be
required. It'll be simpler to have common way to request reconfiguration.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:43 +03:00
+								    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
-												dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-23 15:33:43 -08:00
+								    dp_netdev_destroy_all_pmds(dp);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    /* Reconfigures the cpu mask. */
-												dpif-netdev: Add reconfiguration request to dp_netdev.

Next patches will add new conditions when reconfiguration will be
required. It'll be simpler to have common way to request reconfiguration.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:43 +03:00
+								    ovs_numa_set_cpu_mask(dp->pmd_cmask);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
 								    n_cores = ovs_numa_get_n_cores();
 								    if (n_cores == OVS_CORE_UNSPEC) {
 								        VLOG_ERR("Cannot get cpu core info");
 								        return;
 								    }
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								    HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
 								        int err;
-												dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-23 15:33:43 -08:00
-												dpif-netdev: Handle errors in reconfigure_pmd_threads().

Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved in a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 15:19:28 -07:00
+								        err = port_reconfigure(port);
 								        if (err) {
 								            hmap_remove(&dp->ports, &port->node);
 								            seq_change(dp->port_seq);
 								            port_destroy(port);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								        } else {
 								            port->dynamic_txqs = netdev_n_txq(port->netdev) < n_cores + 1;
-												dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-23 15:33:43 -08:00
+								        }
 								    }
 								    /* Restores the non-pmd. */
 								    dp_netdev_set_nonpmd(dp);
 								    /* Restores all pmd threads. */
 								    dp_netdev_reset_pmd_threads(dp);
 								}
-												netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduce reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-26 15:58:24 -08:00
+								/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
 								static bool
 								ports_require_restart(const struct dp_netdev *dp)
 								    OVS_REQUIRES(dp->port_mutex)
 								{
 								    struct dp_netdev_port *port;
 								    HMAP_FOR_EACH (port, node, &dp->ports) {
 								        if (netdev_is_reconf_required(port->netdev)) {
 								            return true;
 								        }
 								    }
 								    return false;
 								}
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								/* Return true if needs to revalidate datapath flows. */
 								static bool
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								dpif_netdev_run(struct dpif *dpif)
 								{
 								    struct dp_netdev_port *port;
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Fix crash in dpif_netdev_execute().

dp_netdev_get_pmd() is allowed to return NULL (even if we call it with
NON_PMD_CORE_ID) for different reasons:

* Since we use RCU to protect pmd threads, it is possible that
  ovs_refcount_try_ref_rcu() has failed.
* During reconfiguration we destroy every thread.

This commit makes sure that we always handle the case when
dp_netdev_get_pmd() returns NULL without crashing (the change in
dpif_netdev_run() doesn't fix anything, because everything is happening
in the main thread, but it's better to honor the interface in case we
change our threading model).

This actually fixes a pretty serious crash that happens if
dpif_netdev_execute() is called from a non pmd thread while
reconfiguration is happening.  It can be triggered by enabling bfd
(because it's handled by the monitor thread, which is a non pmd thread)
on an interface and changing something that requires datapath
reconfiguration (n_rxq, pmd-cpu-mask, mtu).

A testcase that reproduces the race condition is included.

This is a possible backtrace of the segfault:

 #0  0x000000000060c7f1 in dp_execute_cb (aux_=0x7f1dd2d2a320,
 packets_=0x7f1dd2d2a370, a=0x7f1dd2d2a658, may_steal=false) at
 ../lib/dpif-netdev.c:4357
 #1  0x00000000006448b2 in odp_execute_actions (dp=0x7f1dd2d2a320,
 batch=0x7f1dd2d2a370, steal=false, actions=0x7f1dd2d2a658,
 actions_len=8,
     dp_execute_action=0x60c7a5 <dp_execute_cb>) at
 ../lib/odp-execute.c:538
 #2  0x000000000060d00c in dp_netdev_execute_actions (pmd=0x0,
 packets=0x7f1dd2d2a370, may_steal=false, flow=0x7f1dd2d2ae70,
 actions=0x7f1dd2d2a658, actions_len=8,
     now=44965873) at ../lib/dpif-netdev.c:4577
 #3  0x000000000060834a in dpif_netdev_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2a578) at ../lib/dpif-netdev.c:2624
 #4  0x0000000000608441 in dpif_netdev_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif-netdev.c:2654
 #5  0x0000000000610a30 in dpif_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif.c:1268
 #6  0x000000000061098c in dpif_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2aa50) at ../lib/dpif.c:1233
 #7  0x00000000005b9008 in ofproto_dpif_execute_actions__
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, indentation=0, depth=0, resubmits=0,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:3806
 #8  0x00000000005b907a in ofproto_dpif_execute_actions
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif.c:3823
 #9  0x00000000005dea9b in xlate_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif-xlate.c:5792
 #10 0x00000000005bab12 in ofproto_dpif_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:4628
 #11 0x00000000005c3fc8 in monitor_mport_run (mport=0x2b8cd00,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif-monitor.c:287
 #12 0x00000000005c3d9b in monitor_run () at
 ../ofproto/ofproto-dpif-monitor.c:227
 #13 0x00000000005c3cab in monitor_main (args=0x0) at
 ../ofproto/ofproto-dpif-monitor.c:189
 #14 0x00000000006a183a in ovsthread_wrapper (aux_=0x2b8afd0) at
 ../lib/ovs-thread.c:342
 #15 0x00007f1dd75eb444 in start_thread (arg=0x7f1dd2d2c700) at
 pthread_create.c:333
 #16 0x00007f1dd6e1d20d in clone () at
 ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-10-04 14:53:31 -07:00
+								    struct dp_netdev_pmd_thread *non_pmd;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								    uint64_t new_tnl_seq;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
-												dpif-netdev: Fix crash in dpif_netdev_execute().

dp_netdev_get_pmd() is allowed to return NULL (even if we call it with
NON_PMD_CORE_ID) for different reasons:

* Since we use RCU to protect pmd threads, it is possible that
  ovs_refcount_try_ref_rcu() has failed.
* During reconfiguration we destroy every thread.

This commit makes sure that we always handle the case when
dp_netdev_get_pmd() returns NULL without crashing (the change in
dpif_netdev_run() doesn't fix anything, because everything is happening
in the main thread, but it's better to honor the interface in case we
change our threading model).

This actually fixes a pretty serious crash that happens if
dpif_netdev_execute() is called from a non pmd thread while
reconfiguration is happening.  It can be triggered by enabling bfd
(because it's handled by the monitor thread, which is a non pmd thread)
on an interface and changing something that requires datapath
reconfiguration (n_rxq, pmd-cpu-mask, mtu).

A testcase that reproduces the race condition is included.

This is a possible backtrace of the segfault:

 #0  0x000000000060c7f1 in dp_execute_cb (aux_=0x7f1dd2d2a320,
 packets_=0x7f1dd2d2a370, a=0x7f1dd2d2a658, may_steal=false) at
 ../lib/dpif-netdev.c:4357
 #1  0x00000000006448b2 in odp_execute_actions (dp=0x7f1dd2d2a320,
 batch=0x7f1dd2d2a370, steal=false, actions=0x7f1dd2d2a658,
 actions_len=8,
     dp_execute_action=0x60c7a5 <dp_execute_cb>) at
 ../lib/odp-execute.c:538
 #2  0x000000000060d00c in dp_netdev_execute_actions (pmd=0x0,
 packets=0x7f1dd2d2a370, may_steal=false, flow=0x7f1dd2d2ae70,
 actions=0x7f1dd2d2a658, actions_len=8,
     now=44965873) at ../lib/dpif-netdev.c:4577
 #3  0x000000000060834a in dpif_netdev_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2a578) at ../lib/dpif-netdev.c:2624
 #4  0x0000000000608441 in dpif_netdev_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif-netdev.c:2654
 #5  0x0000000000610a30 in dpif_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif.c:1268
 #6  0x000000000061098c in dpif_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2aa50) at ../lib/dpif.c:1233
 #7  0x00000000005b9008 in ofproto_dpif_execute_actions__
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, indentation=0, depth=0, resubmits=0,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:3806
 #8  0x00000000005b907a in ofproto_dpif_execute_actions
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif.c:3823
 #9  0x00000000005dea9b in xlate_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif-xlate.c:5792
 #10 0x00000000005bab12 in ofproto_dpif_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:4628
 #11 0x00000000005c3fc8 in monitor_mport_run (mport=0x2b8cd00,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif-monitor.c:287
 #12 0x00000000005c3d9b in monitor_run () at
 ../ofproto/ofproto-dpif-monitor.c:227
 #13 0x00000000005c3cab in monitor_main (args=0x0) at
 ../ofproto/ofproto-dpif-monitor.c:189
 #14 0x00000000006a183a in ovsthread_wrapper (aux_=0x2b8afd0) at
 ../lib/ovs-thread.c:342
 #15 0x00007f1dd75eb444 in start_thread (arg=0x7f1dd2d2c700) at
 pthread_create.c:333
 #16 0x00007f1dd6e1d20d in clone () at
 ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-10-04 14:53:31 -07:00
+								    non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
 								    if (non_pmd) {
 								        ovs_mutex_lock(&dp->non_pmd_mutex);
 								        HMAP_FOR_EACH (port, node, &dp->ports) {
 								            if (!netdev_is_pmd(port->netdev)) {
 								                int i;
-												netdev: Add support multiqueue recv.

new netdev type like DPDK can support multi-queue IO. Following
patch Adds support for same.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 20:52:06 -07:00
-												dpif-netdev: Fix crash in dpif_netdev_execute().

dp_netdev_get_pmd() is allowed to return NULL (even if we call it with
NON_PMD_CORE_ID) for different reasons:

* Since we use RCU to protect pmd threads, it is possible that
  ovs_refcount_try_ref_rcu() has failed.
* During reconfiguration we destroy every thread.

This commit makes sure that we always handle the case when
dp_netdev_get_pmd() returns NULL without crashing (the change in
dpif_netdev_run() doesn't fix anything, because everything is happening
in the main thread, but it's better to honor the interface in case we
change our threading model).

This actually fixes a pretty serious crash that happens if
dpif_netdev_execute() is called from a non pmd thread while
reconfiguration is happening.  It can be triggered by enabling bfd
(because it's handled by the monitor thread, which is a non pmd thread)
on an interface and changing something that requires datapath
reconfiguration (n_rxq, pmd-cpu-mask, mtu).

A testcase that reproduces the race condition is included.

This is a possible backtrace of the segfault:

 #0  0x000000000060c7f1 in dp_execute_cb (aux_=0x7f1dd2d2a320,
 packets_=0x7f1dd2d2a370, a=0x7f1dd2d2a658, may_steal=false) at
 ../lib/dpif-netdev.c:4357
 #1  0x00000000006448b2 in odp_execute_actions (dp=0x7f1dd2d2a320,
 batch=0x7f1dd2d2a370, steal=false, actions=0x7f1dd2d2a658,
 actions_len=8,
     dp_execute_action=0x60c7a5 <dp_execute_cb>) at
 ../lib/odp-execute.c:538
 #2  0x000000000060d00c in dp_netdev_execute_actions (pmd=0x0,
 packets=0x7f1dd2d2a370, may_steal=false, flow=0x7f1dd2d2ae70,
 actions=0x7f1dd2d2a658, actions_len=8,
     now=44965873) at ../lib/dpif-netdev.c:4577
 #3  0x000000000060834a in dpif_netdev_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2a578) at ../lib/dpif-netdev.c:2624
 #4  0x0000000000608441 in dpif_netdev_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif-netdev.c:2654
 #5  0x0000000000610a30 in dpif_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif.c:1268
 #6  0x000000000061098c in dpif_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2aa50) at ../lib/dpif.c:1233
 #7  0x00000000005b9008 in ofproto_dpif_execute_actions__
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, indentation=0, depth=0, resubmits=0,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:3806
 #8  0x00000000005b907a in ofproto_dpif_execute_actions
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif.c:3823
 #9  0x00000000005dea9b in xlate_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif-xlate.c:5792
 #10 0x00000000005bab12 in ofproto_dpif_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:4628
 #11 0x00000000005c3fc8 in monitor_mport_run (mport=0x2b8cd00,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif-monitor.c:287
 #12 0x00000000005c3d9b in monitor_run () at
 ../ofproto/ofproto-dpif-monitor.c:227
 #13 0x00000000005c3cab in monitor_main (args=0x0) at
 ../ofproto/ofproto-dpif-monitor.c:189
 #14 0x00000000006a183a in ovsthread_wrapper (aux_=0x2b8afd0) at
 ../lib/ovs-thread.c:342
 #15 0x00007f1dd75eb444 in start_thread (arg=0x7f1dd2d2c700) at
 pthread_create.c:333
 #16 0x00007f1dd6e1d20d in clone () at
 ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-10-04 14:53:31 -07:00
+								                for (i = 0; i < port->n_rxq; i++) {
 								                    dp_netdev_process_rxq_port(non_pmd, port,
 								                                               port->rxqs[i].rxq);
 								                }
-												netdev: Add support multiqueue recv.

new netdev type like DPDK can support multi-queue IO. Following
patch Adds support for same.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 20:52:06 -07:00
+								            }
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								        }
-												dpif-netdev: Fix crash in dpif_netdev_execute().

dp_netdev_get_pmd() is allowed to return NULL (even if we call it with
NON_PMD_CORE_ID) for different reasons:

* Since we use RCU to protect pmd threads, it is possible that
  ovs_refcount_try_ref_rcu() has failed.
* During reconfiguration we destroy every thread.

This commit makes sure that we always handle the case when
dp_netdev_get_pmd() returns NULL without crashing (the change in
dpif_netdev_run() doesn't fix anything, because everything is happening
in the main thread, but it's better to honor the interface in case we
change our threading model).

This actually fixes a pretty serious crash that happens if
dpif_netdev_execute() is called from a non pmd thread while
reconfiguration is happening.  It can be triggered by enabling bfd
(because it's handled by the monitor thread, which is a non pmd thread)
on an interface and changing something that requires datapath
reconfiguration (n_rxq, pmd-cpu-mask, mtu).

A testcase that reproduces the race condition is included.

This is a possible backtrace of the segfault:

 #0  0x000000000060c7f1 in dp_execute_cb (aux_=0x7f1dd2d2a320,
 packets_=0x7f1dd2d2a370, a=0x7f1dd2d2a658, may_steal=false) at
 ../lib/dpif-netdev.c:4357
 #1  0x00000000006448b2 in odp_execute_actions (dp=0x7f1dd2d2a320,
 batch=0x7f1dd2d2a370, steal=false, actions=0x7f1dd2d2a658,
 actions_len=8,
     dp_execute_action=0x60c7a5 <dp_execute_cb>) at
 ../lib/odp-execute.c:538
 #2  0x000000000060d00c in dp_netdev_execute_actions (pmd=0x0,
 packets=0x7f1dd2d2a370, may_steal=false, flow=0x7f1dd2d2ae70,
 actions=0x7f1dd2d2a658, actions_len=8,
     now=44965873) at ../lib/dpif-netdev.c:4577
 #3  0x000000000060834a in dpif_netdev_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2a578) at ../lib/dpif-netdev.c:2624
 #4  0x0000000000608441 in dpif_netdev_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif-netdev.c:2654
 #5  0x0000000000610a30 in dpif_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif.c:1268
 #6  0x000000000061098c in dpif_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2aa50) at ../lib/dpif.c:1233
 #7  0x00000000005b9008 in ofproto_dpif_execute_actions__
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, indentation=0, depth=0, resubmits=0,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:3806
 #8  0x00000000005b907a in ofproto_dpif_execute_actions
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif.c:3823
 #9  0x00000000005dea9b in xlate_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif-xlate.c:5792
 #10 0x00000000005bab12 in ofproto_dpif_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:4628
 #11 0x00000000005c3fc8 in monitor_mport_run (mport=0x2b8cd00,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif-monitor.c:287
 #12 0x00000000005c3d9b in monitor_run () at
 ../ofproto/ofproto-dpif-monitor.c:227
 #13 0x00000000005c3cab in monitor_main (args=0x0) at
 ../ofproto/ofproto-dpif-monitor.c:189
 #14 0x00000000006a183a in ovsthread_wrapper (aux_=0x2b8afd0) at
 ../lib/ovs-thread.c:342
 #15 0x00007f1dd75eb444 in start_thread (arg=0x7f1dd2d2c700) at
 pthread_create.c:333
 #16 0x00007f1dd6e1d20d in clone () at
 ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-10-04 14:53:31 -07:00
+								        dpif_netdev_xps_revalidate_pmd(non_pmd, time_msec(), false);
 								        ovs_mutex_unlock(&dp->non_pmd_mutex);
-												dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-23 15:33:43 -08:00
-												dpif-netdev: Fix crash in dpif_netdev_execute().

dp_netdev_get_pmd() is allowed to return NULL (even if we call it with
NON_PMD_CORE_ID) for different reasons:

* Since we use RCU to protect pmd threads, it is possible that
  ovs_refcount_try_ref_rcu() has failed.
* During reconfiguration we destroy every thread.

This commit makes sure that we always handle the case when
dp_netdev_get_pmd() returns NULL without crashing (the change in
dpif_netdev_run() doesn't fix anything, because everything is happening
in the main thread, but it's better to honor the interface in case we
change our threading model).

This actually fixes a pretty serious crash that happens if
dpif_netdev_execute() is called from a non pmd thread while
reconfiguration is happening.  It can be triggered by enabling bfd
(because it's handled by the monitor thread, which is a non pmd thread)
on an interface and changing something that requires datapath
reconfiguration (n_rxq, pmd-cpu-mask, mtu).

A testcase that reproduces the race condition is included.

This is a possible backtrace of the segfault:

 #0  0x000000000060c7f1 in dp_execute_cb (aux_=0x7f1dd2d2a320,
 packets_=0x7f1dd2d2a370, a=0x7f1dd2d2a658, may_steal=false) at
 ../lib/dpif-netdev.c:4357
 #1  0x00000000006448b2 in odp_execute_actions (dp=0x7f1dd2d2a320,
 batch=0x7f1dd2d2a370, steal=false, actions=0x7f1dd2d2a658,
 actions_len=8,
     dp_execute_action=0x60c7a5 <dp_execute_cb>) at
 ../lib/odp-execute.c:538
 #2  0x000000000060d00c in dp_netdev_execute_actions (pmd=0x0,
 packets=0x7f1dd2d2a370, may_steal=false, flow=0x7f1dd2d2ae70,
 actions=0x7f1dd2d2a658, actions_len=8,
     now=44965873) at ../lib/dpif-netdev.c:4577
 #3  0x000000000060834a in dpif_netdev_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2a578) at ../lib/dpif-netdev.c:2624
 #4  0x0000000000608441 in dpif_netdev_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif-netdev.c:2654
 #5  0x0000000000610a30 in dpif_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif.c:1268
 #6  0x000000000061098c in dpif_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2aa50) at ../lib/dpif.c:1233
 #7  0x00000000005b9008 in ofproto_dpif_execute_actions__
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, indentation=0, depth=0, resubmits=0,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:3806
 #8  0x00000000005b907a in ofproto_dpif_execute_actions
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif.c:3823
 #9  0x00000000005dea9b in xlate_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif-xlate.c:5792
 #10 0x00000000005bab12 in ofproto_dpif_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:4628
 #11 0x00000000005c3fc8 in monitor_mport_run (mport=0x2b8cd00,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif-monitor.c:287
 #12 0x00000000005c3d9b in monitor_run () at
 ../ofproto/ofproto-dpif-monitor.c:227
 #13 0x00000000005c3cab in monitor_main (args=0x0) at
 ../ofproto/ofproto-dpif-monitor.c:189
 #14 0x00000000006a183a in ovsthread_wrapper (aux_=0x2b8afd0) at
 ../lib/ovs-thread.c:342
 #15 0x00007f1dd75eb444 in start_thread (arg=0x7f1dd2d2c700) at
 pthread_create.c:333
 #16 0x00007f1dd6e1d20d in clone () at
 ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-10-04 14:53:31 -07:00
+								        dp_netdev_pmd_unref(non_pmd);
 								    }
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
-												dpif-netdev: Add reconfiguration request to dp_netdev.

Next patches will add new conditions when reconfiguration will be
required. It'll be simpler to have common way to request reconfiguration.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:43 +03:00
+								    if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
-												dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-23 15:33:43 -08:00
+								        reconfigure_pmd_threads(dp);
 								    }
 								    ovs_mutex_unlock(&dp->port_mutex);
-												tnl-arp-cache: Rename module and functions to tnl-neigh-cache.

Since we don't distinguish between IPv4 and IPv6 lookups, consolidate ARP
and ND cache into neighbor cache. Other references to ARP related to the
ARP cache but that are not really about ARP have been renamed as well.
tnl_arp_lookup is kept for lookups using IPv4 instead of IPv4-mapped
addresses, but that is going to be removed in a later patch.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2015-11-30 16:24:49 -02:00
+								    tnl_neigh_cache_run();
-												tnl-ports: Add destination IP and MAC address to the match.

Currently tnl-port table wildcard destination ip and mac addresses
for given tunnel packet.  That could result accepting tunnel
packets destined for other hosts.  Following patch adds
support for matching for ip and mac address.
IP address upates to tnl-port table are piggybacked on
ovs-router updates.

Reported-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-09-03 00:42:34 -07:00
+								    tnl_port_map_run();
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								    new_tnl_seq = seq_read(tnl_conf_seq);
 								    if (dp->last_tnl_conf_seq != new_tnl_seq) {
 								        dp->last_tnl_conf_seq = new_tnl_seq;
 								        return true;
 								    }
 								    return false;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								}
 								static void
 								dpif_netdev_wait(struct dpif *dpif)
 								{
 								    struct dp_netdev_port *port;
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_lock(&dp_netdev_mutex);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
 								    HMAP_FOR_EACH (port, node, &dp->ports) {
-												netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduce reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-02-26 15:58:24 -08:00
+								        netdev_wait_reconf_required(port->netdev);
-												netdev: Add support multiqueue recv.

new netdev type like DPDK can support multi-queue IO. Following
patch Adds support for same.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 20:52:06 -07:00
+								        if (!netdev_is_pmd(port->netdev)) {
 								            int i;
-												dpif-netdev: Keep count of elements in port->rxq[].

This will ease deleting a port with no open rxqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 14:25:33 -08:00
+								            for (i = 0; i < port->n_rxq; i++) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								                netdev_rxq_wait(port->rxqs[i].rxq);
-												netdev: Add support multiqueue recv.

new netdev type like DPDK can support multi-queue IO. Following
patch Adds support for same.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 20:52:06 -07:00
+								            }
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								        }
 								    }
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    ovs_mutex_unlock(&dp->port_mutex);
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_unlock(&dp_netdev_mutex);
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								    seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								}
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								static void
 								pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
 								{
 								    struct tx_port *tx_port_cached;
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    /* Free all used tx queue ids. */
 								    dpif_netdev_xps_revalidate_pmd(pmd, 0, true);
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
 								        free(tx_port_cached);
 								    }
 								    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        free(tx_port_cached);
 								    }
 								}
 								/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
 								 * 'pmd->port_cache' (thread local) */
 								static void
 								pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
 								    OVS_REQUIRES(pmd->port_mutex)
 								{
 								    struct tx_port *tx_port, *tx_port_cached;
 								    pmd_free_cached_ports(pmd);
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								    hmap_shrink(&pmd->send_port_cache);
 								    hmap_shrink(&pmd->tnl_port_cache);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
 								    HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								        if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
 								            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
 								            hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
 								                        hash_port_no(tx_port_cached->port->port_no));
 								        }
 								        if (netdev_n_txq(tx_port->port->netdev)) {
 								            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
 								            hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
 								                        hash_port_no(tx_port_cached->port->port_no));
 								        }
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    }
 								}
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								static int
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
 								                          struct rxq_poll **ppoll_list)
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								{
-												netdev: Rename netdev_rx to netdev_rxq

Preparation for multi queue netdev IO.  There are no functional changes
in this patch.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 19:38:14 -07:00
+								    struct rxq_poll *poll_list = *ppoll_list;
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								    struct rxq_poll *poll;
 								    int i;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    ovs_mutex_lock(&pmd->port_mutex);
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								    poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);
-												dpif-netdev: Introduce port_try_ref() to prevent a race.

When pmd thread interates through all ports for queue loading,
the main thread may unreference and 'rcu-free' a port before
pmd thread take new reference of it.  This could cause pmd
thread fail the reference and access freed memory later.

This commit fixes this race by introducing port_try_ref()
which uses ovs_refcount_try_ref_rcu().  And the pmd thread
will only load the port's queue, if port_try_ref() returns
true.

Found by inspection.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-21 15:54:07 -07:00
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								    i = 0;
 								    LIST_FOR_EACH (poll, node, &pmd->poll_list) {
 								        poll_list[i++] = *poll;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    }
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
 								    pmd_load_cached_ports(pmd);
 								    ovs_mutex_unlock(&pmd->port_mutex);
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
 								    *ppoll_list = poll_list;
-												dpif-netdev: Fix race condition in pmd thread initialization.

The pmds and the main threads are synchronized using a condition
variable.  The main thread writes a new configuration, then it waits on
the condition variable.  A pmd thread reads the new configuration, then
it calls signal() on the condition variable. To make sure that the pmds
and the main thread have a consistent view, each signal() should be
backed by a wait().

Currently the first signal() doesn't have a corresponding wait().  If
the pmd thread takes a long time to start and the signal() is received
by a later wait, the threads will have an inconsistent view.

The commit fixes the problem by removing the first signal() from the
pmd thread.

This is hardly a problem on current master, because the main thread
will call the first wait() a long time after the creation of a pmd
thread.  It becomes a problem with the next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:02:14 -07:00
+								    return i;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								}
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								static void *
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								pmd_thread_main(void *f_)
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								{
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    struct dp_netdev_pmd_thread *pmd = f_;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    unsigned int lc = 0;
-												netdev: Rename netdev_rx to netdev_rxq

Preparation for multi queue netdev IO.  There are no functional changes
in this patch.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 19:38:14 -07:00
+								    struct rxq_poll *poll_list;
-												lib/dpif-netdev: Clean-up pmd thread signaling.

It could be possible that the thread misses a signal when it reads the
change_seq again after reload.  Also, the counter has no dependent
data, so the memory model for the atomic read can be relaxed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
											
										
										
											2014-08-15 15:09:38 -07:00
+								    unsigned int port_seq = PMD_INITIAL_SEQ;
-												dpif-netdev: Fix race condition in pmd thread initialization.

The pmds and the main threads are synchronized using a condition
variable.  The main thread writes a new configuration, then it waits on
the condition variable.  A pmd thread reads the new configuration, then
it calls signal() on the condition variable. To make sure that the pmds
and the main thread have a consistent view, each signal() should be
backed by a wait().

Currently the first signal() doesn't have a corresponding wait().  If
the pmd thread takes a long time to start and the signal() is received
by a later wait, the threads will have an inconsistent view.

The commit fixes the problem by removing the first signal() from the
pmd thread.

This is hardly a problem on current master, because the main thread
will call the first wait() a long time after the creation of a pmd
thread.  It becomes a problem with the next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:02:14 -07:00
+								    bool exiting;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    int poll_cnt;
 								    int i;
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    poll_list = NULL;
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
 								    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
-												ovs-numa: Introduce function to set current thread affinity.

This commit moves the code that sets the pmd threads affinity from
netdev-dpdk to ovs-numa.  There's one small part left in netdev-dpdk, to
set the lcore_id.

Now dpif-netdev will call both modules (ovs-numa and netdev-dpdk) when
starting a pmd thread.

This change will allow having a dummy implementation of the set affinity
call, for testing purposes.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-06-06 17:05:49 -07:00
+								    ovs_numa_thread_setaffinity_core(pmd->core_id);
 								    dpdk_set_lcore_id(pmd->core_id);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								reload:
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    emc_cache_init(&pmd->flow_cache);
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
-												dpif-netdev: log port/core affinity

When using multiple PMDs and numerous ports, a performance gain
may be achieved in some use cases by pinning a PMD/port to a
particular (set of) core(s).

This patch provides a summary of the switch's port/core affinities
each time that the status of the switch's ports is modified.
Based on this information, a user may determine what affinity
modifications are required to optimise performance for their
particular use case.

Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
Signed-off-by: Wojciech Andralojc <wojciechx.andralojc@intel.com>
Acked-by: Flavio Leitner <fbl@redhat.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-06-09 07:49:18 -07:00
+								    /* List port/core affinity */
 								    for (i = 0; i < poll_cnt; i++) {
-												dpif-netdev: Add dpif-netdev/pmd-rxq-show appctl command.

This command can be used to check the port/rxq assignment to
pmd threads. For each pmd thread of the datapath shows list
of queue-ids with port names.

Additionally log message from pmd_thread_main() extended with
queue-id, and type of this message changed from INFO to DBG.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 10:38:47 +03:00
+								       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
 								                pmd->core_id, netdev_get_name(poll_list[i].port->netdev),
 								                netdev_rxq_get_queue_id(poll_list[i].rx));
-												dpif-netdev: log port/core affinity

When using multiple PMDs and numerous ports, a performance gain
may be achieved in some use cases by pinning a PMD/port to a
particular (set of) core(s).

This patch provides a summary of the switch's port/core affinities
each time that the status of the switch's ports is modified.
Based on this information, a user may determine what affinity
modifications are required to optimise performance for their
particular use case.

Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
Signed-off-by: Wojciech Andralojc <wojciechx.andralojc@intel.com>
Acked-by: Flavio Leitner <fbl@redhat.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-06-09 07:49:18 -07:00
+								    }
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    for (;;) {
 								        for (i = 0; i < poll_cnt; i++) {
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								            dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								        }
 								        if (lc++ > 1024) {
-												lib/dpif-netdev: Clean-up pmd thread signaling.

It could be possible that the thread misses a signal when it reads the
change_seq again after reload.  Also, the counter has no dependent
data, so the memory model for the atomic read can be relaxed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
											
										
										
											2014-08-15 15:09:38 -07:00
+								            unsigned int seq;
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								            lc = 0;
-												lib/dpif-netdev: Clean-up pmd thread signaling.

It could be possible that the thread misses a signal when it reads the
change_seq again after reload.  Also, the counter has no dependent
data, so the memory model for the atomic read can be relaxed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
											
										
										
											2014-08-15 15:09:38 -07:00
-												coverage: Add coverage_try_clear() for performance-critical threads.

For performance-critical threads like pmd threads, we currently make them
never call coverage_clear() to avoid contention over the global mutex
'coverage_mutex'.  So, even though pmd thread still keeps updating their
thread-local coverage count, the count is never attributed to the global
total.  But it is useful to have them available.

This commit makes this happen by implementing a non-contending version
of the clear function, coverage_try_clear().  The function will use
the ovs_mutex_trylock() and return immediately if the mutex cannot
be acquired.  Since threads like pmd thread are always busy-looping,
the lock will eventually be acquired.

Requested-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ben Pfaff <blp@nicira.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com

											
										
										
											2015-08-13 11:31:49 -07:00
+								            coverage_try_clear();
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								            dp_netdev_pmd_try_optimize(pmd);
-												dpif-netdev: Remove PMD latency on seq_mutex

The PMD thread needs to keep processing RX queues in order
to achieve maximum throughput. It also needs to sweep emc
cache and quiesce which use seq_mutex. That mutex can
eventually block the PMD thread causing latency spikes and
affecting the throughput.

Since there is no requirement for running those tasks at a
specific time, this patch extend seq API to allow tentative
locking instead.

Reported-by: Karl Rister <krister@redhat.com>
Co-authored-by: Karl Rister <krister@redhat.com>
Signed-off-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-05 10:33:38 -03:00
+								            if (!ovsrcu_try_quiesce()) {
 								                emc_cache_slow_sweep(&pmd->flow_cache);
 								            }
-												lib/dpif-netdev: Clean-up pmd thread signaling.

It could be possible that the thread misses a signal when it reads the
change_seq again after reload.  Also, the counter has no dependent
data, so the memory model for the atomic read can be relaxed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
											
										
										
											2014-08-15 15:09:38 -07:00
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								            atomic_read_relaxed(&pmd->change_seq, &seq);
-												lib/dpif-netdev: Clean-up pmd thread signaling.

It could be possible that the thread misses a signal when it reads the
change_seq again after reload.  Also, the counter has no dependent
data, so the memory model for the atomic read can be relaxed.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
											
										
										
											2014-08-15 15:09:38 -07:00
+								            if (seq != port_seq) {
 								                port_seq = seq;
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								                break;
 								            }
 								        }
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    }
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
-												dpif-netdev: Fix race condition in pmd thread initialization.

The pmds and the main threads are synchronized using a condition
variable.  The main thread writes a new configuration, then it waits on
the condition variable.  A pmd thread reads the new configuration, then
it calls signal() on the condition variable. To make sure that the pmds
and the main thread have a consistent view, each signal() should be
backed by a wait().

Currently the first signal() doesn't have a corresponding wait().  If
the pmd thread takes a long time to start and the signal() is received
by a later wait, the threads will have an inconsistent view.

The commit fixes the problem by removing the first signal() from the
pmd thread.

This is hardly a problem on current master, because the main thread
will call the first wait() a long time after the creation of a pmd
thread.  It becomes a problem with the next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:02:14 -07:00
+								    exiting = latch_is_set(&pmd->exit_latch);
 								    /* Signal here to make sure the pmd finishes
 								     * reloading the updated configuration. */
 								    dp_netdev_pmd_reload_done(pmd);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    emc_cache_uninit(&pmd->flow_cache);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
-												dpif-netdev: Fix race condition in pmd thread initialization.

The pmds and the main threads are synchronized using a condition
variable.  The main thread writes a new configuration, then it waits on
the condition variable.  A pmd thread reads the new configuration, then
it calls signal() on the condition variable. To make sure that the pmds
and the main thread have a consistent view, each signal() should be
backed by a wait().

Currently the first signal() doesn't have a corresponding wait().  If
the pmd thread takes a long time to start and the signal() is received
by a later wait, the threads will have an inconsistent view.

The commit fixes the problem by removing the first signal() from the
pmd thread.

This is hardly a problem on current master, because the main thread
will call the first wait() a long time after the creation of a pmd
thread.  It becomes a problem with the next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:02:14 -07:00
+								    if (!exiting) {
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								        goto reload;
 								    }
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    free(poll_list);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    pmd_free_cached_ports(pmd);
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								    return NULL;
 								}
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								static void
 								dp_netdev_disable_upcall(struct dp_netdev *dp)
 								    OVS_ACQUIRES(dp->upcall_rwlock)
 								{
 								    fat_rwlock_wrlock(&dp->upcall_rwlock);
 								}
 								static void
 								dpif_netdev_disable_upcall(struct dpif *dpif)
 								    OVS_NO_THREAD_SAFETY_ANALYSIS
 								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
 								    dp_netdev_disable_upcall(dp);
 								}
 								static void
 								dp_netdev_enable_upcall(struct dp_netdev *dp)
 								    OVS_RELEASES(dp->upcall_rwlock)
 								{
 								    fat_rwlock_unlock(&dp->upcall_rwlock);
 								}
 								static void
 								dpif_netdev_enable_upcall(struct dpif *dpif)
 								    OVS_NO_THREAD_SAFETY_ANALYSIS
 								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
 								    dp_netdev_enable_upcall(dp);
 								}
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								static void
-												dpif-netdev: Allow direct destroy of 'struct dp_netdev_port'.

Before this commit, when 'struct dp_netdev_port' is deleted from
'dpif-netdev' datapath, if there is pmd thread, the pmd thread
will release the last reference to the port and ovs-rcu postpone
the destroy.  However, the delayed close of object like 'struct
netdev' could cause failure in immediate re-add or reconfigure of
the same device.

To fix the above issue, this commit uses condition variable and
makes the main thread wait for pmd thread to release the reference
when deleting port.  Then, the main thread can directly destroy the
port.

Reported-by: Cian Ferriter <cian.ferriter@intel.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-11-07 13:27:05 -08:00
+								dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
 								{
 								    ovs_mutex_lock(&pmd->cond_mutex);
 								    xpthread_cond_signal(&pmd->cond);
 								    ovs_mutex_unlock(&pmd->cond_mutex);
 								}
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
-												dpif-netdev: Fix crash in dpif_netdev_execute().

dp_netdev_get_pmd() is allowed to return NULL (even if we call it with
NON_PMD_CORE_ID) for different reasons:

* Since we use RCU to protect pmd threads, it is possible that
  ovs_refcount_try_ref_rcu() has failed.
* During reconfiguration we destroy every thread.

This commit makes sure that we always handle the case when
dp_netdev_get_pmd() returns NULL without crashing (the change in
dpif_netdev_run() doesn't fix anything, because everything is happening
in the main thread, but it's better to honor the interface in case we
change our threading model).

This actually fixes a pretty serious crash that happens if
dpif_netdev_execute() is called from a non pmd thread while
reconfiguration is happening.  It can be triggered by enabling bfd
(because it's handled by the monitor thread, which is a non pmd thread)
on an interface and changing something that requires datapath
reconfiguration (n_rxq, pmd-cpu-mask, mtu).

A testcase that reproduces the race condition is included.

This is a possible backtrace of the segfault:

 #0  0x000000000060c7f1 in dp_execute_cb (aux_=0x7f1dd2d2a320,
 packets_=0x7f1dd2d2a370, a=0x7f1dd2d2a658, may_steal=false) at
 ../lib/dpif-netdev.c:4357
 #1  0x00000000006448b2 in odp_execute_actions (dp=0x7f1dd2d2a320,
 batch=0x7f1dd2d2a370, steal=false, actions=0x7f1dd2d2a658,
 actions_len=8,
     dp_execute_action=0x60c7a5 <dp_execute_cb>) at
 ../lib/odp-execute.c:538
 #2  0x000000000060d00c in dp_netdev_execute_actions (pmd=0x0,
 packets=0x7f1dd2d2a370, may_steal=false, flow=0x7f1dd2d2ae70,
 actions=0x7f1dd2d2a658, actions_len=8,
     now=44965873) at ../lib/dpif-netdev.c:4577
 #3  0x000000000060834a in dpif_netdev_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2a578) at ../lib/dpif-netdev.c:2624
 #4  0x0000000000608441 in dpif_netdev_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif-netdev.c:2654
 #5  0x0000000000610a30 in dpif_operate (dpif=0x2b67b70,
 ops=0x7f1dd2d2a5c8, n_ops=1) at ../lib/dpif.c:1268
 #6  0x000000000061098c in dpif_execute (dpif=0x2b67b70,
 execute=0x7f1dd2d2aa50) at ../lib/dpif.c:1233
 #7  0x00000000005b9008 in ofproto_dpif_execute_actions__
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, indentation=0, depth=0, resubmits=0,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:3806
 #8  0x00000000005b907a in ofproto_dpif_execute_actions
 (ofproto=0x2b69360, version=18446744073709551614, flow=0x7f1dd2d2ae70,
 rule=0x0, ofpacts=0x7f1dd2d2b100,
     ofpacts_len=16, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif.c:3823
 #9  0x00000000005dea9b in xlate_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at
 ../ofproto/ofproto-dpif-xlate.c:5792
 #10 0x00000000005bab12 in ofproto_dpif_send_packet (ofport=0x2b98380,
 oam=false, packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif.c:4628
 #11 0x00000000005c3fc8 in monitor_mport_run (mport=0x2b8cd00,
 packet=0x7f1dd2d2b5c0) at ../ofproto/ofproto-dpif-monitor.c:287
 #12 0x00000000005c3d9b in monitor_run () at
 ../ofproto/ofproto-dpif-monitor.c:227
 #13 0x00000000005c3cab in monitor_main (args=0x0) at
 ../ofproto/ofproto-dpif-monitor.c:189
 #14 0x00000000006a183a in ovsthread_wrapper (aux_=0x2b8afd0) at
 ../lib/ovs-thread.c:342
 #15 0x00007f1dd75eb444 in start_thread (arg=0x7f1dd2d2c700) at
 pthread_create.c:333
 #16 0x00007f1dd6e1d20d in clone () at
 ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-10-04 14:53:31 -07:00
+								 * the pointer if succeeds, otherwise, NULL (it can return NULL even if
 								 * 'core_id' is NON_PMD_CORE_ID).
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								 *
 								 * Caller must unrefs the returned reference.  */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static struct dp_netdev_pmd_thread *
-												ovs-numa: Change 'core_id' to unsigned.

DPDK lcore_id is unsigned.  We need to support big values like
LCORE_ID_ANY (=UINT32_MAX).  Therefore I am changing the type everywhere
in OVS.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:19 +01:00
+								dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								{
 								    struct dp_netdev_pmd_thread *pmd;
-												lib/cmap: split up cmap_find().

This makes the following patch easier and cleans up the code.

Explicit "inline" keywords seem necessary to prevent performance
regression on cmap_find() with GCC 4.7 -O2.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2014-09-24 10:39:20 -07:00
+								    const struct cmap_node *pnode;
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
-												dpif-netdev: Add function to get pmd using core id.

This commit adds the function dp_netdev_get_pmd() which allows
users to get 'struct dp_netdev_pmd_thread' based on the core id.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-13 14:47:09 -07:00
+								    pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    if (!pnode) {
 								        return NULL;
 								    }
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								}
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
 								static void
 								dp_netdev_set_nonpmd(struct dp_netdev *dp)
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    OVS_REQUIRES(dp->port_mutex)
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								{
 								    struct dp_netdev_pmd_thread *non_pmd;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    struct dp_netdev_port *port;
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
 								    non_pmd = xzalloc(sizeof *non_pmd);
-												dpif-netdev: Remove unused 'index' in dp_netdev_pmd_thread.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 12:54:10 -07:00
+								    dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    HMAP_FOR_EACH (port, node, &dp->ports) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        dp_netdev_add_port_tx_to_pmd(non_pmd, port);
 								    }
 								    dp_netdev_reload_pmd__(non_pmd);
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								}
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								/* Caller must have valid pointer to 'pmd'. */
 								static bool
 								dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
 								{
 								    return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
 								}
 								static void
 								dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
 								{
 								    if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
 								        ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
 								    }
 								}
 								/* Given cmap position 'pos', tries to ref the next node.  If try_ref()
 								 * fails, keeps checking for next node until reaching the end of cmap.
 								 *
 								 * Caller must unrefs the returned reference. */
 								static struct dp_netdev_pmd_thread *
 								dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
 								{
 								    struct dp_netdev_pmd_thread *next;
 								    do {
 								        struct cmap_node *node;
 								        node = cmap_next_position(&dp->poll_threads, pos);
 								        next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
 								            : NULL;
 								    } while (next && !dp_netdev_pmd_try_ref(next));
 								    return next;
 								}
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								/* Configures the 'pmd' based on the input argument. */
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								static void
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
-												dpif-netdev: Remove unused 'index' in dp_netdev_pmd_thread.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-07 12:54:10 -07:00
+								                        unsigned core_id, int numa_id)
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								{
 								    pmd->dp = dp;
 								    pmd->core_id = core_id;
 								    pmd->numa_id = numa_id;
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								    pmd->poll_cnt = 0;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    atomic_init(&pmd->static_tx_qid,
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								                (core_id == NON_PMD_CORE_ID)
 								                ? ovs_numa_get_n_cores()
 								                : get_n_pmd_threads(dp));
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    ovs_refcount_init(&pmd->ref_cnt);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    latch_init(&pmd->exit_latch);
 								    atomic_init(&pmd->change_seq, PMD_INITIAL_SEQ);
-												dpif-netdev: Allow direct destroy of 'struct dp_netdev_port'.

Before this commit, when 'struct dp_netdev_port' is deleted from
'dpif-netdev' datapath, if there is pmd thread, the pmd thread
will release the last reference to the port and ovs-rcu postpone
the destroy.  However, the delayed close of object like 'struct
netdev' could cause failure in immediate re-add or reconfigure of
the same device.

To fix the above issue, this commit uses condition variable and
makes the main thread wait for pmd thread to release the reference
when deleting port.  Then, the main thread can directly destroy the
port.

Reported-by: Cian Ferriter <cian.ferriter@intel.com>
Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-11-07 13:27:05 -08:00
+								    xpthread_cond_init(&pmd->cond, NULL);
 								    ovs_mutex_init(&pmd->cond_mutex);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    ovs_mutex_init(&pmd->flow_mutex);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    ovs_mutex_init(&pmd->port_mutex);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    cmap_init(&pmd->flow_table);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    cmap_init(&pmd->classifiers);
 								    pmd->next_optimization = time_msec() + DPCLS_OPTIMIZATION_INTERVAL;
-												list: Rename all functions in list.h with ovs_ prefix.

This attempts to prevent namespace collisions with other list libraries

Signed-off-by: Ben Warren <ben@skyportsystems.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-03-25 14:10:22 -07:00
+								    ovs_list_init(&pmd->poll_list);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    hmap_init(&pmd->tx_ports);
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								    hmap_init(&pmd->tnl_port_cache);
 								    hmap_init(&pmd->send_port_cache);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    /* init the 'flow_cache' since there is no
 								     * actual thread created for NON_PMD_CORE_ID. */
 								    if (core_id == NON_PMD_CORE_ID) {
 								        emc_cache_init(&pmd->flow_cache);
 								    }
 								    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
 								                hash_int(core_id, 0));
 								}
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								static void
 								dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
 								{
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    struct dpcls *cls;
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    dp_netdev_pmd_flow_flush(pmd);
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								    hmap_destroy(&pmd->send_port_cache);
 								    hmap_destroy(&pmd->tnl_port_cache);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    hmap_destroy(&pmd->tx_ports);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    /* All flows (including their dpcls_rules) have been deleted already */
 								    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
 								        dpcls_destroy(cls);
-												dpif-netdev: Fix memory leak.

We keep all the per-port classifiers around, since they can be reused,
but when a pmd thread is destroyed we should free them.

Found using valgrind.

Fixes: 3453b4d62a98("dpif-netdev: dpcls per in_port with sorted
subtables")

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-11-15 15:40:49 -08:00
+								        ovsrcu_postpone(free, cls);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    }
 								    cmap_destroy(&pmd->classifiers);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    cmap_destroy(&pmd->flow_table);
 								    ovs_mutex_destroy(&pmd->flow_mutex);
 								    latch_destroy(&pmd->exit_latch);
 								    xpthread_cond_destroy(&pmd->cond);
 								    ovs_mutex_destroy(&pmd->cond_mutex);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    ovs_mutex_destroy(&pmd->port_mutex);
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    free(pmd);
 								}
 								/* Stops the pmd thread, removes it from the 'dp->poll_threads',
 								 * and unrefs the struct. */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static void
-												dpif-netdev: Purge all ukeys when reconfigure pmd.

When dpdk configuration changes, all pmd threads are recreated
and rx queues of each port are reloaded.  After this process,
rx queue could be mapped to a different pmd thread other than
the one before reconfiguration.  However, this is totally
transparent to ofproto layer modules.  So, if the ofproto-dpif-upcall
module still holds ukeys generated before pmd thread recreation,
this old ukey will collide with the ukey for the new upcalls
from same traffic flow, causing flow installation failure.

To fix the bug, this commit adds a new call-back function
in dpif layer for notifying upper layer the purging of datapath
(e.g. pmd thread deletion in dpif-netdev).  So, the
ofproto-dpif-upcall module can react properly with deleting
the ukeys and with collecting flows' last stats.

Reported-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Alex Wang <ee07b291@gmail.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joestringer@nicira.com>

											
										
										
											2015-08-25 16:36:46 -07:00
+								dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								{
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
 								     * but extra cleanup is necessary */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    if (pmd->core_id == NON_PMD_CORE_ID) {
-												dpif-netdev: Take non_pmd_mutex to access tx cached ports.

As documented in dp_netdev_pmd_thread, we must take non_pmd_mutex to
access the tx port caches for the non pmd thread.

Found by inspection.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								        ovs_mutex_lock(&dp->non_pmd_mutex);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								        emc_cache_uninit(&pmd->flow_cache);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        pmd_free_cached_ports(pmd);
-												dpif-netdev: Take non_pmd_mutex to access tx cached ports.

As documented in dp_netdev_pmd_thread, we must take non_pmd_mutex to
access the tx port caches for the non pmd thread.

Found by inspection.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								        ovs_mutex_unlock(&dp->non_pmd_mutex);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    } else {
 								        latch_set(&pmd->exit_latch);
 								        dp_netdev_reload_pmd__(pmd);
 								        ovs_numa_unpin_core(pmd->core_id);
 								        xpthread_join(pmd->thread, NULL);
 								    }
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    dp_netdev_pmd_clear_ports(pmd);
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
-												dpif-netdev: Purge all ukeys when reconfigure pmd.

When dpdk configuration changes, all pmd threads are recreated
and rx queues of each port are reloaded.  After this process,
rx queue could be mapped to a different pmd thread other than
the one before reconfiguration.  However, this is totally
transparent to ofproto layer modules.  So, if the ofproto-dpif-upcall
module still holds ukeys generated before pmd thread recreation,
this old ukey will collide with the ukey for the new upcalls
from same traffic flow, causing flow installation failure.

To fix the bug, this commit adds a new call-back function
in dpif layer for notifying upper layer the purging of datapath
(e.g. pmd thread deletion in dpif-netdev).  So, the
ofproto-dpif-upcall module can react properly with deleting
the ukeys and with collecting flows' last stats.

Reported-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Alex Wang <ee07b291@gmail.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joestringer@nicira.com>

											
										
										
											2015-08-25 16:36:46 -07:00
+								    /* Purges the 'pmd''s flows after stopping the thread, but before
 								     * destroying the flows, so that the flow stats can be collected. */
 								    if (dp->dp_purge_cb) {
 								        dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
 								    }
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    dp_netdev_pmd_unref(pmd);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								}
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								/* Destroys all pmd threads. */
 								static void
 								dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
 								{
 								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
+								    struct dp_netdev_pmd_thread **pmd_list;
 								    size_t k = 0, n_pmds;
 								    n_pmds = cmap_count(&dp->poll_threads);
 								    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
 								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
+								        /* We cannot call dp_netdev_del_pmd(), since it alters
 								         * 'dp->poll_threads' (while we're iterating it) and it
 								         * might quiesce. */
 								        ovs_assert(k < n_pmds);
 								        pmd_list[k++] = pmd;
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								    }
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
 								    for (size_t i = 0; i < k; i++) {
 								        dp_netdev_del_pmd(dp, pmd_list[i]);
 								    }
 								    free(pmd_list);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								}
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								/* Deletes all pmd threads on numa node 'numa_id' and
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								 * fixes static_tx_qids of other threads to keep them sequential. */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static void
 								dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
 								{
 								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								    int n_pmds_on_numa, n_pmds;
 								    int *free_idx, k = 0;
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
+								    struct dp_netdev_pmd_thread **pmd_list;
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
 								    n_pmds_on_numa = get_n_pmd_threads_on_numa(dp, numa_id);
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
+								    free_idx = xcalloc(n_pmds_on_numa, sizeof *free_idx);
 								    pmd_list = xcalloc(n_pmds_on_numa, sizeof *pmd_list);
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
+								        /* We cannot call dp_netdev_del_pmd(), since it alters
 								         * 'dp->poll_threads' (while we're iterating it) and it
 								         * might quiesce. */
-												dpif-netdev: non-pmd thread static_tx_qid should be constant

The non-pmd thread static_tx_qid is assumed to be equal to the highest
core ID + 1. The function dp_netdev_del_pmds_on_numa() invalidates
this assumption by re-distributing the static_tx_qid:s on all pmd and
non-pmd threads of the "other" numa.

There might be a number of unwanted effects due to the non-pmd thread
static_tx_qid being changed. The actual fault, observed in OVS 2.5, was a
crash due to the TX burst queues containing a NULL packet buffer pointer
in the range of valid buffers, presumably caused by a race condition.

In OVS 2.6 TX burst queues have been removed, nevertheless the current
behavior is incorrect.

The correction makes dp_netdev_del_pmds_on_numa() honor the constancy
of the non-pmd static_tx_qid value by excluding all non-pmd threads
from the deletion and from the re-ordering of the static_tx_qid.

Signed-off-by: Patrik Andersson <patrik.r.andersson@ericsson.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-28 13:32:08 +02:00
+								        if (pmd->numa_id == numa_id && pmd->core_id != NON_PMD_CORE_ID) {
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								            atomic_read_relaxed(&pmd->static_tx_qid, &free_idx[k]);
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
+								            pmd_list[k] = pmd;
 								            ovs_assert(k < n_pmds_on_numa);
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								            k++;
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								        }
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								    }
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
+								    for (int i = 0; i < k; i++) {
 								        dp_netdev_del_pmd(dp, pmd_list[i]);
 								    }
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								    n_pmds = get_n_pmd_threads(dp);
 								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
 								        int old_tx_qid;
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								        atomic_read_relaxed(&pmd->static_tx_qid, &old_tx_qid);
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
-												dpif-netdev: non-pmd thread static_tx_qid should be constant

The non-pmd thread static_tx_qid is assumed to be equal to the highest
core ID + 1. The function dp_netdev_del_pmds_on_numa() invalidates
this assumption by re-distributing the static_tx_qid:s on all pmd and
non-pmd threads of the "other" numa.

There might be a number of unwanted effects due to the non-pmd thread
static_tx_qid being changed. The actual fault, observed in OVS 2.5, was a
crash due to the TX burst queues containing a NULL packet buffer pointer
in the range of valid buffers, presumably caused by a race condition.

In OVS 2.6 TX burst queues have been removed, nevertheless the current
behavior is incorrect.

The correction makes dp_netdev_del_pmds_on_numa() honor the constancy
of the non-pmd static_tx_qid value by excluding all non-pmd threads
from the deletion and from the re-ordering of the static_tx_qid.

Signed-off-by: Patrik Andersson <patrik.r.andersson@ericsson.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-28 13:32:08 +02:00
+								        if (old_tx_qid >= n_pmds && pmd->core_id != NON_PMD_CORE_ID) {
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								            int new_tx_qid = free_idx[--k];
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								            atomic_store_relaxed(&pmd->static_tx_qid, new_tx_qid);
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								        }
 								    }
-												dpif-netdev: Fix improper use of CMAP_FOR_EACH.

It is ok to iterate a cmap with CMAP_FOR_EACH and remove elements with
cmap_remove(), but having quiescent states inside the loop might create
problems, since some of the postponed cleanup done inside the cmap might
be executed, freeing the memory that the iterator is using.

We had several of these errors in dpif-netdev, because when we rearrange
ports or threads we often need to wait on a condition variable (which
implies a quiescent state).

This problem caused iterations to skip elements or to list them twice,
resulting in the main thread waiting on a condition without anyone else
to signal.

Fix these cases by moving the possible quiescent states outside
CMAP_FOR_EACH loops.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>

											
										
										
											2016-01-26 18:53:52 -08:00
+								    free(pmd_list);
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								    free(free_idx);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								}
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								/* Deletes all rx queues from pmd->poll_list and all the ports from
 								 * pmd->tx_ports. */
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								static void
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								{
 								    struct rxq_poll *poll;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    struct tx_port *port;
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    ovs_mutex_lock(&pmd->port_mutex);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								    LIST_FOR_EACH_POP (poll, node, &pmd->poll_list) {
 								        free(poll);
 								    }
 								    pmd->poll_cnt = 0;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
 								        free(port);
 								    }
 								    ovs_mutex_unlock(&pmd->port_mutex);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								}
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								static struct tx_port *
 								tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
 								{
 								    struct tx_port *tx;
 								    HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								        if (tx->port->port_no == port_no) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								            return tx;
 								        }
 								    }
 								    return NULL;
 								}
 								/* Deletes all rx queues of 'port' from 'poll_list', and the 'port' from
 								 * 'tx_ports' of 'pmd' thread.  Returns true if 'port' was found in 'pmd'
 								 * (therefore a restart is required). */
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
+								static bool
 								dp_netdev_del_port_from_pmd__(struct dp_netdev_port *port,
 								                              struct dp_netdev_pmd_thread *pmd)
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								{
 								    struct rxq_poll *poll, *next;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    struct tx_port *tx;
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								    bool found = false;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    ovs_mutex_lock(&pmd->port_mutex);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								    LIST_FOR_EACH_SAFE (poll, next, node, &pmd->poll_list) {
 								        if (poll->port == port) {
 								            found = true;
-												list: Rename all functions in list.h with ovs_ prefix.

This attempts to prevent namespace collisions with other list libraries

Signed-off-by: Ben Warren <ben@skyportsystems.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-03-25 14:10:22 -07:00
+								            ovs_list_remove(&poll->node);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								            pmd->poll_cnt--;
 								            free(poll);
 								        }
 								    }
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
 								    tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
 								    if (tx) {
 								        hmap_remove(&pmd->tx_ports, &tx->node);
 								        free(tx);
 								        found = true;
 								    }
 								    ovs_mutex_unlock(&pmd->port_mutex);
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
 								    return found;
 								}
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								/* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
 								 * threads.  The pmd threads that need to be restarted are inserted in
 								 * 'to_reload'. */
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
+								static void
 								dp_netdev_del_port_from_all_pmds__(struct dp_netdev *dp,
 								                                   struct dp_netdev_port *port,
 								                                   struct hmapx *to_reload)
 								{
 								    struct dp_netdev_pmd_thread *pmd;
 								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        bool found;
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        found = dp_netdev_del_port_from_pmd__(port, pmd);
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        if (found) {
 								            hmapx_add(to_reload, pmd);
 								        }
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								    }
 								}
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								/* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
 								 * threads. Reloads the threads if needed. */
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								static void
 								dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
 								                                 struct dp_netdev_port *port)
 								{
 								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
+								    struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
 								    struct hmapx_node *node;
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
+								    dp_netdev_del_port_from_all_pmds__(dp, port, &to_reload);
 								    HMAPX_FOR_EACH (node, &to_reload) {
 								        pmd = (struct dp_netdev_pmd_thread *) node->data;
 								        dp_netdev_reload_pmd__(pmd);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								    }
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
 								    hmapx_destroy(&to_reload);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								}
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								/* Returns non-isolated PMD thread from this numa node with fewer
 								 * rx queues to poll. Returns NULL if there is no non-isolated  PMD threads
 								 * on this numa node. Can be called safely only by main thread. */
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								static struct dp_netdev_pmd_thread *
 								dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id)
 								{
 								    int min_cnt = -1;
 								    struct dp_netdev_pmd_thread *pmd, *res = NULL;
 								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        if (!pmd->isolated && pmd->numa_id == numa_id
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								            && (min_cnt > pmd->poll_cnt || res == NULL)) {
 								            min_cnt = pmd->poll_cnt;
 								            res = pmd;
 								        }
 								    }
 								    return res;
 								}
 								/* Adds rx queue to poll_list of PMD thread. */
 								static void
 								dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
 								                         struct dp_netdev_port *port, struct netdev_rxq *rx)
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    OVS_REQUIRES(pmd->port_mutex)
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								{
 								    struct rxq_poll *poll = xmalloc(sizeof *poll);
 								    poll->port = port;
 								    poll->rx = rx;
-												list: Rename all functions in list.h with ovs_ prefix.

This attempts to prevent namespace collisions with other list libraries

Signed-off-by: Ben Warren <ben@skyportsystems.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-03-25 14:10:22 -07:00
+								    ovs_list_push_back(&pmd->poll_list, &poll->node);
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								    pmd->poll_cnt++;
 								}
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
 								 * changes to take effect. */
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								static void
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
 								                             struct dp_netdev_port *port)
 								{
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								    struct tx_port *tx;
 								    tx = xzalloc(sizeof *tx);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    tx->port = port;
 								    tx->qid = -1;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
 								    ovs_mutex_lock(&pmd->port_mutex);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    ovs_mutex_unlock(&pmd->port_mutex);
 								}
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								/* Distribute all {pinned|non-pinned} rx queues of 'port' between PMD
 								 * threads in 'dp'. The pmd threads that need to be restarted are inserted
 								 * in 'to_reload'. PMD threads with pinned queues marked as isolated. */
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								static void
 								dp_netdev_add_port_rx_to_pmds(struct dp_netdev *dp,
 								                              struct dp_netdev_port *port,
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								                              struct hmapx *to_reload, bool pinned)
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								{
 								    int numa_id = netdev_get_numa_id(port->netdev);
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								    int i;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    if (!netdev_is_pmd(port->netdev)) {
 								        return;
 								    }
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
-												dpif-netdev: Keep count of elements in port->rxq[].

This will ease deleting a port with no open rxqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

											
										
										
											2016-02-25 14:25:33 -08:00
+								    for (i = 0; i < port->n_rxq; i++) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        if (pinned) {
-												dpif-netdev: Uses the OVS_CORE_UNSPEC instead of magic numbers.

This patch uses OVS_CORE_UNSPEC for the queue unpinned instead
of "-1". More important, the "-1" casted to unsigned int is
equal to NON_PMD_CORE_ID. We make the distinction between them.

Signed-off-by: nickcooper-zhangtonghao <nic@opencloud.tech>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2017-01-08 17:30:22 -08:00
+								            if (port->rxqs[i].core_id == OVS_CORE_UNSPEC) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								                continue;
 								            }
 								            pmd = dp_netdev_get_pmd(dp, port->rxqs[i].core_id);
 								            if (!pmd) {
 								                VLOG_WARN("There is no PMD thread on core %d. "
 								                          "Queue %d on port \'%s\' will not be polled.",
 								                          port->rxqs[i].core_id, i,
 								                          netdev_get_name(port->netdev));
 								                continue;
 								            }
 								            pmd->isolated = true;
 								            dp_netdev_pmd_unref(pmd);
 								        } else {
-												dpif-netdev: Uses the OVS_CORE_UNSPEC instead of magic numbers.

This patch uses OVS_CORE_UNSPEC for the queue unpinned instead
of "-1". More important, the "-1" casted to unsigned int is
equal to NON_PMD_CORE_ID. We make the distinction between them.

Signed-off-by: nickcooper-zhangtonghao <nic@opencloud.tech>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2017-01-08 17:30:22 -08:00
+								            if (port->rxqs[i].core_id != OVS_CORE_UNSPEC) {
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								                continue;
 								            }
 								            pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
 								            if (!pmd) {
 								                VLOG_WARN("There's no available pmd thread on numa node %d",
 								                          numa_id);
 								                break;
 								            }
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								        }
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        ovs_mutex_lock(&pmd->port_mutex);
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        dp_netdev_add_rxq_to_pmd(pmd, port, port->rxqs[i].rxq);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        ovs_mutex_unlock(&pmd->port_mutex);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
+								        hmapx_add(to_reload, pmd);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
+								    }
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
+								}
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								/* Distributes all non-pinned rx queues of 'port' between all PMD threads
 								 * in 'dp' and inserts 'port' in the PMD threads 'tx_ports'. The pmd threads
 								 * that need to be restarted are inserted in 'to_reload'. */
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								static void
 								dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, struct dp_netdev_port *port,
 								                             struct hmapx *to_reload)
 								{
 								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    dp_netdev_add_port_rx_to_pmds(dp, port, to_reload, false);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
 								    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
 								        dp_netdev_add_port_tx_to_pmd(pmd, port);
 								        hmapx_add(to_reload, pmd);
 								    }
 								}
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								/* Distributes all non-pinned rx queues of 'port' between all PMD threads
 								 * in 'dp', inserts 'port' in the PMD threads 'tx_ports' and reloads them,
 								 * if needed. */
-												dpif-netdev: Add functions to modify rxq without reloading pmd threads.

This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 17:01:25 -07:00
+								static void
 								dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
 								{
 								    struct dp_netdev_pmd_thread *pmd;
 								    struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
 								    struct hmapx_node *node;
 								    dp_netdev_add_port_to_pmds__(dp, port, &to_reload);
-												dpif-netdev: Move rxq management into functions.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-02-08 18:30:30 +03:00
 								    HMAPX_FOR_EACH (node, &to_reload) {
 								        pmd = (struct dp_netdev_pmd_thread *) node->data;
 								        dp_netdev_reload_pmd__(pmd);
 								    }
 								    hmapx_destroy(&to_reload);
 								}
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								/* Starts pmd threads for the numa node 'numa_id', if not already started.
 								 * The function takes care of filling the threads tx port cache. */
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								static void
 								dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    OVS_REQUIRES(dp->port_mutex)
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								{
 								    int n_pmds;
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    if (!ovs_numa_numa_id_is_valid(numa_id)) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        VLOG_WARN("Cannot create pmd threads due to numa id (%d) invalid",
 								                  numa_id);
 								        return;
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    }
 								    n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
 								    /* If there are already pmd threads created for the numa node
 								     * in which 'netdev' is on, do nothing.  Else, creates the
 								     * pmd threads for the numa node. */
 								    if (!n_pmds) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								        int can_have, n_unpinned, i;
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
 								        n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
 								        if (!n_unpinned) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								            VLOG_WARN("Cannot create pmd threads due to out of unpinned "
 								                      "cores on numa node %d", numa_id);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								            return;
 								        }
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								        /* If cpu mask is specified, uses all unpinned cores, otherwise
 								         * tries creating NR_PMD_THREADS pmd threads. */
 								        can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								        for (i = 0; i < can_have; i++) {
-												ovs-numa: Change 'core_id' to unsigned.

DPDK lcore_id is unsigned.  We need to support big values like
LCORE_ID_ANY (=UINT32_MAX).  Therefore I am changing the type everywhere
in OVS.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-05-22 17:14:19 +01:00
+								            unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								            struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
 								            struct dp_netdev_port *port;
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								            dp_netdev_configure_pmd(pmd, dp, core_id, numa_id);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								            HMAP_FOR_EACH (port, node, &dp->ports) {
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								                dp_netdev_add_port_tx_to_pmd(pmd, port);
-												dpif-netdev: Rework of rx queue management.

Current rx queue management model is buggy and will not work properly
without additional barriers and other syncronization between PMD
threads and main thread.

Known BUGS of current model:
	* While reloading, two PMD threads, one already reloaded and
	  one not yet reloaded, can poll same queue of the same port.
	  This behavior may lead to dpdk driver failure, because they
	  are not thread-safe.
	* Same bug as fixed in commit e4e74c3a2b
	  ("dpif-netdev: Purge all ukeys when reconfigure pmd.") but
	  reproduced while only reconfiguring of pmd threads without
	  restarting, because addition may change the sequence of
	  other ports, which is important in time of reconfiguration.

Introducing the new model, where distribution of queues made by main
thread with minimal synchronizations and without data races between
pmd threads. Also, this model should work faster, because only
needed threads will be interrupted for reconfiguraition and total
computational complexity of reconfiguration is less.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:33 +03:00
+								            }
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								            pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								        }
 								        VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
+								    }
 								}
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
-												dpif-netdev: Use separate threads for forwarding.

For now, we use exactly two threads.  Presumably at some point we will want
to make this configurable.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-12-27 17:00:30 -08:00
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								/* Called after pmd threads config change.  Restarts pmd threads with
 								 * new configuration. */
 								static void
 								dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    OVS_REQUIRES(dp->port_mutex)
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								{
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
 								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								    struct dp_netdev_port *port;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    struct hmapx_node *node;
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    HMAP_FOR_EACH (port, node, &dp->ports) {
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								        if (netdev_is_pmd(port->netdev)) {
-												dpif-netdev: Honor rxq affinity during pmd threads creation.

Currently, If user will set up 'pmd-rxq-affinity' to cores on
different numa node, they may not be polled, because pmd threads
will not be created there even if this cores are in 'pmd-cpu-mask'.
Fix that by creating threads on all numa nodes rxqs assigned to.

Fixes: 3eb67853c481 ("dpif-netdev: Introduce pmd-rxq-affinity.")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-11-15 14:36:39 +03:00
+								            struct hmapx numas = HMAPX_INITIALIZER(&numas);
 								            struct hmapx_node *numa_node;
 								            uintptr_t numa_id;
 								            int i;
 								            numa_id = netdev_get_numa_id(port->netdev);
 								            hmapx_add(&numas, (void *) numa_id);
 								            for (i = 0; i < port->n_rxq; i++) {
 								                unsigned core_id = port->rxqs[i].core_id;
-												dpif-netdev: Uses the OVS_CORE_UNSPEC instead of magic numbers.

This patch uses OVS_CORE_UNSPEC for the queue unpinned instead
of "-1". More important, the "-1" casted to unsigned int is
equal to NON_PMD_CORE_ID. We make the distinction between them.

Signed-off-by: nickcooper-zhangtonghao <nic@opencloud.tech>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2017-01-08 17:30:22 -08:00
+								                if (core_id != OVS_CORE_UNSPEC) {
-												dpif-netdev: Honor rxq affinity during pmd threads creation.

Currently, If user will set up 'pmd-rxq-affinity' to cores on
different numa node, they may not be polled, because pmd threads
will not be created there even if this cores are in 'pmd-cpu-mask'.
Fix that by creating threads on all numa nodes rxqs assigned to.

Fixes: 3eb67853c481 ("dpif-netdev: Introduce pmd-rxq-affinity.")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-11-15 14:36:39 +03:00
+								                    numa_id = ovs_numa_get_numa_id(core_id);
 								                    hmapx_add(&numas, (void *) numa_id);
 								                }
 								            }
 								            HMAPX_FOR_EACH (numa_node, &numas) {
 								                dp_netdev_set_pmds_on_numa(dp, (uintptr_t) numa_node->data);
 								            }
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
-												dpif-netdev: Honor rxq affinity during pmd threads creation.

Currently, If user will set up 'pmd-rxq-affinity' to cores on
different numa node, they may not be polled, because pmd threads
will not be created there even if this cores are in 'pmd-cpu-mask'.
Fix that by creating threads on all numa nodes rxqs assigned to.

Fixes: 3eb67853c481 ("dpif-netdev: Introduce pmd-rxq-affinity.")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-11-15 14:36:39 +03:00
+								            hmapx_destroy(&numas);
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								        }
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								        /* Distribute only pinned rx queues first to mark threads as isolated */
 								        dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload, true);
 								    }
 								    /* Distribute remaining non-pinned rx queues to non-isolated PMD threads. */
 								    HMAP_FOR_EACH (port, node, &dp->ports) {
 								        dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload, false);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    }
 								    HMAPX_FOR_EACH (node, &to_reload) {
 								        pmd = (struct dp_netdev_pmd_thread *) node->data;
 								        dp_netdev_reload_pmd__(pmd);
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								    }
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
 								    hmapx_destroy(&to_reload);
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								}
-												bridge: Store datapath version into ovsdb

OVS userspace are backward compatible with older Linux kernel modules.
However, not having the most up-to-date datapath kernel modules can
some times lead to user confusion. Storing the datapath version in
OVSDB allows management software to check and optionally provide
notifications to users.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-16 15:23:11 -07:00
+								static char *
 								dpif_netdev_get_datapath_version(void)
 								{
 								     return xstrdup("<built-in>");
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								static void
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
-												dpif-netdev: Cache time_msec() calls for each received batch.

Calling time_msec() (which calls clock_gettime()) too often might be
expensive.  With this commit OVS makes only one call per received
batch and caches the result.

Suggested-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:47 +01:00
+								                    uint16_t tcp_flags, long long now)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								    uint16_t flags;
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								    atomic_store_relaxed(&netdev_flow->stats.used, now);
 								    non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
 								    non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
 								    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
 								    flags |= tcp_flags;
 								    atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
-												ovs-thread: Replace ovsthread_counter by more general ovsthread_stats.

This allows clients to do more than just increment a counter.  The
following commit will make the first use of that feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-19 07:47:12 -07:00
+								}
 								static void
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
 								                       enum dp_stat_type type, int cnt)
-												ovs-thread: Replace ovsthread_counter by more general ovsthread_stats.

This allows clients to do more than just increment a counter.  The
following commit will make the first use of that feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-19 07:47:12 -07:00
+								{
-												dpif-netdev: Make datapath and flow stats atomic.

A read operation from a non atomic shared value (without external
locking) can return incorrect values.  Using the atomic semantics
prevents this from happening.

However:
* No memory barriers are used.  We don't need that kind of consistency
  for statistics (we use relaxed operations).
* The updates are not atomic, just the loads and stores.  This is ok
  because there's a single writer.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:14 +01:00
+								    non_atomic_ullong_add(&pmd->stats.n[type], cnt);
-												ovs-thread: Replace ovsthread_counter by more general ovsthread_stats.

This allows clients to do more than just increment a counter.  The
following commit will make the first use of that feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2014-03-19 07:47:12 -07:00
+								}
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								static int
-												dpif_packet: Rename to dp_packet

dp_packet is short and better name for datapath packet
structure.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2015-02-25 12:01:53 -08:00
+								dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
-												dpif: Generate flow_hash for revalidators in dpif.

This patch shifts the responsibility for determining the hash for a flow
from the revalidation logic down to the dpif layer. This assists in
handling backward-compatibility for revalidation with the upcoming
unique flow identifier "UFID" patches.

A 128-bit UFID was selected to minimize the likelihood of hash conflicts.
Handler threads will not install a flow that has an identical UFID as
another flow, to prevent misattribution of stats and to ensure that the
correct flow key cache is used for revalidation.

For datapaths that do not support UFID, which is currently all
datapaths, the dpif will generate the UFID and pass it up during upcall
and flow_dump. This is generated based on the datapath flow key.

Later patches will add support for datapaths to store and interpret this
UFID, in which case the dpif has a responsibility to pass it through
transparently.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 15:24:39 +12:00
+								                 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								                 enum dpif_upcall_type type, const struct nlattr *userdata,
 								                 struct ofpbuf *actions, struct ofpbuf *put_actions)
 								{
-												dpif-netdev: Add per-pmd flow-table/classifier.

This commit changes the per dpif-netdev datapath flow-table/
classifier to per pmd-thread.  As direct benefit, datapath
and flow statistics no longer need to be protected by mutex
or be declared as per-thread variable, since they are only
written by the owning pmd thread.

As side effects, the flow-dump output of userspace datapath
can contain overlapping flows.  To reduce confusion, the dump
from different pmd thread will be separated by a title line.
In addition, the flow operations via 'ovs-appctl dpctl/*'
are modified so that if the given flow in_port corresponds
to a dpdk interface, the operation will be conducted to all
pmd threads recv from that interface (expect for flow-get
which will always be applied to non-pmd threads).

Signed-off-by: Alex Wang <alexw@nicira.com>
Tested-by: Mark D. Gray <mark.d.gray@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-10-12 18:18:47 -07:00
+								    struct dp_netdev *dp = pmd->dp;
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
 								    if (OVS_UNLIKELY(!dp->upcall_cb)) {
 								        return ENODEV;
 								    }
 								    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
 								        struct ds ds = DS_EMPTY_INITIALIZER;
 								        char *packet_str;
-												dp-packet: Remove ofpbuf dependency.

Currently dp-packet make use of ofpbuf for managing packet
buffers. That complicates ofpbuf, by making dp-packet
independent of ofpbuf both libraries can be optimized for
their own use case.
This avoids mapping operation between ofpbuf and dp_packet
in datapath upcalls.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-02-22 03:21:09 -08:00
+								        struct ofpbuf key;
-												odp-util: Convert flow serialization parameters to a struct.

Serializing between userspace flows and netlink attributes currently
requires several additional parameters besides the flows themselves.
This will continue to grow in the future as well. This converts
the function arguments to a parameters struct, which makes the code
easier to read and allowing irrelevant arguments to be omitted.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-16 11:15:28 -07:00
+								        struct odp_flow_key_parms odp_parms = {
 								            .flow = flow,
-												ovs-vswitchd: Avoid segfault for "netdev" datapath.

When the datapath, whose type is "netdev", processes packets
in userspce action, it may cause a segmentation fault. In the
dp_execute_userspace_action(), we pass the "wc" argument to
dp_netdev_upcall() using NULL. In the dp_netdev_upcall() call tree,
the "wc" will be used. For example, dp_netdev_upcall() uses the
&wc->masks for debugging, and flow_wildcards_init_for_packet()
uses the  "wc" if we disable megaflow, which is described in
more detail below.

Segmentation fault in flow_wildcards_init_for_packet:

    #0  0x0000000000468fe8 flow_wildcards_init_for_packet lib/flow.c:1275
    #1  0x0000000000436c0b upcall_cb ofproto/ofproto-dpif-upcall.c:1231
    #2  0x000000000045bd96 dp_netdev_upcall lib/dpif-netdev.c:3857
    #3  0x0000000000461bf3 dp_execute_userspace_action lib/dpif-netdev.c:4388
    #4  dp_execute_cb lib/dpif-netdev.c:4521
    #5  0x0000000000486ae2 odp_execute_actions lib/odp-execute.c:538
    #6  0x00000000004607f9 dp_netdev_execute_actions lib/dpif-netdev.c:4627
    #7  packet_batch_per_flow_execute lib/dpif-netdev.c:3927
    #8  dp_netdev_input__ lib/dpif-netdev.c:4229
    #9  0x0000000000460ba8 dp_netdev_input lib/dpif-netdev.c:4238
    #10 dp_netdev_process_rxq_port lib/dpif-netdev.c:2873
    #11 0x000000000046126e dpif_netdev_run lib/dpif-netdev.c:3000
    #12 0x000000000042baf5 type_run ofproto/ofproto-dpif.c:504
    #13 0x00000000004192bf ofproto_type_run ofproto/ofproto.c:1687
    #14 0x0000000000409965 bridge_run__ vswitchd/bridge.c:2875
    #15 0x000000000040f145 bridge_run vswitchd/bridge.c:2938
    #16 0x00000000004062e5 main vswitchd/ovs-vswitchd.c:111

Signed-off-by: nickcooper-zhangtonghao <nic@opencloud.tech>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-12-07 10:04:04 -08:00
+								            .mask = wc ? &wc->masks : NULL,
-												odp-util: Share fields between odp and dpif_backer.

Datapath support for some flow key fields is used inside ofproto-dpif as
well as odp-util. Share these fields using the same structure.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-30 16:43:03 -07:00
+								            .support = dp_netdev_support,
-												odp-util: Convert flow serialization parameters to a struct.

Serializing between userspace flows and netlink attributes currently
requires several additional parameters besides the flows themselves.
This will continue to grow in the future as well. This converts
the function arguments to a parameters struct, which makes the code
easier to read and allowing irrelevant arguments to be omitted.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-16 11:15:28 -07:00
+								        };
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
 								        ofpbuf_init(&key, 0);
-												odp-util: Convert flow serialization parameters to a struct.

Serializing between userspace flows and netlink attributes currently
requires several additional parameters besides the flows themselves.
This will continue to grow in the future as well. This converts
the function arguments to a parameters struct, which makes the code
easier to read and allowing irrelevant arguments to be omitted.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2015-06-16 11:15:28 -07:00
+								        odp_flow_key_from_flow(&odp_parms, &key);
-												dp-packet: Remove ofpbuf dependency.

Currently dp-packet make use of ofpbuf for managing packet
buffers. That complicates ofpbuf, by making dp-packet
independent of ofpbuf both libraries can be optimized for
their own use case.
This avoids mapping operation between ofpbuf and dp_packet
in datapath upcalls.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-02-22 03:21:09 -08:00
+								        packet_str = ofp_packet_to_string(dp_packet_data(packet_),
 								                                          dp_packet_size(packet_));
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
-												ofpbuf: Simplify ofpbuf API.

ofpbuf was complicated due to its wide usage across all
layers of OVS, Now we have introduced independent dp_packet
which can be used for datapath packet, we can simplify ofpbuf.
Following patch removes DPDK mbuf and access API of ofpbuf
members.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-03-02 17:29:44 -08:00
+								        odp_flow_key_format(key.data, key.size, &ds);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
 								        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
 								                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
 								        ofpbuf_uninit(&key);
 								        free(packet_str);
-												ofpbuf: Simplify ofpbuf API.

ofpbuf was complicated due to its wide usage across all
layers of OVS, Now we have introduced independent dp_packet
which can be used for datapath packet, we can simplify ofpbuf.
Following patch removes DPDK mbuf and access API of ofpbuf
members.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-03-02 17:29:44 -08:00
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								        ds_destroy(&ds);
 								    }
-												tun-metadata: Manage tunnel TLV mapping table on a per-bridge basis.

When using tunnel TLVs (at the moment, this means Geneve options), a
controller must first map the class and type onto an appropriate OXM
field so that it can be used in OVS flow operations. This table is
managed using OpenFlow extensions.

The original code that added support for TLVs made the mapping table
global as a simplification. However, this is not really logically
correct as the OpenFlow management commands are operating on a per-bridge
basis. This removes the original limitation to make the table per-bridge.

One nice result of this change is that it is generally clearer whether
the tunnel metadata is in datapath or OpenFlow format. Rather than
allowing ad-hoc format changes and trying to handle both formats in the
tunnel metadata functions, the format is more clearly separated by function.
Datapaths (both kernel and userspace) use datapath format and it is not
changed during the upcall process. At the beginning of action translation,
tunnel metadata is converted to OpenFlow format and flows and wildcards
are translated back at the end of the process.

As an additional benefit, this change improves performance in some flow
setup situations by keeping the tunnel metadata in the original packet
format in more cases. This helps when copies need to be made as the amount
of data touched is only what is present in the packet rather than the
maximum amount of metadata supported.

Co-authored-by: Madhu Challa <challa@noironetworks.com>
Signed-off-by: Madhu Challa <challa@noironetworks.com>
Signed-off-by: Jesse Gross <jesse@kernel.org>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2016-04-19 18:36:04 -07:00
+								    return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
 								                         actions, wc, put_actions, dp->upcall_aux);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								}
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								static inline uint32_t
-												dpif-netdev: Reset RSS hash when recirculating.

Having the same RSS hash after recirculation can cause unnecessary
collisions in the exact match cache.  A simple solution is to rehash it
with the recirculation depth if it is non-zero.

Suggested-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-22 19:22:52 +01:00
+								dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
 								                                const struct miniflow *mf)
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								{
-												dpif-netdev: Reset RSS hash when recirculating.

Having the same RSS hash after recirculation can cause unnecessary
collisions in the exact match cache.  A simple solution is to rehash it
with the recirculation depth if it is non-zero.

Suggested-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-22 19:22:52 +01:00
+								    uint32_t hash, recirc_depth;
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
-												dpif-netdev: Check for PKT_RX_RSS_HASH flag.

DPDK mbufs contain a valid RSS hash only if PKT_RX_RSS_HASH is
set in 'ol_flags'.  Otherwise the hash is garbage and doesn't
relate to the packet.

This fixes an issue with vhost, which, being a virtual NIC, doesn't
compute the hash.

Reported-by: Dongjun <dongj@dtdream.com>
Suggested-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Kevin Traynor <kevin.traynor@intel.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2015-06-16 19:16:24 +01:00
+								    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
 								        hash = dp_packet_get_rss_hash(packet);
 								    } else {
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								        hash = miniflow_hash_5tuple(mf, 0);
-												dp-packet: Rename 'dp_hash' in 'rss_hash'.

We already have the 'dp_hash' embedded in the metadata.  This caused
confusion in the code.  With this commit it should be clear that
'rss_hash' is the packet hash used for internal purposes, while
'md.dp_hash' is part of the flow, computed during the execution of
certain actions.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:48 +01:00
+								        dp_packet_set_rss_hash(packet, hash);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    }
-												dpif-netdev: Reset RSS hash when recirculating.

Having the same RSS hash after recirculation can cause unnecessary
collisions in the exact match cache.  A simple solution is to rehash it
with the recirculation depth if it is non-zero.

Suggested-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-22 19:22:52 +01:00
 								    /* The RSS hash must account for the recirculation depth to avoid
 								     * collisions in the exact match cache */
 								    recirc_depth = *recirc_depth_get_unsafe();
 								    if (OVS_UNLIKELY(recirc_depth)) {
 								        hash = hash_finish(hash, recirc_depth);
 								        dp_packet_set_rss_hash(packet, hash);
 								    }
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    return hash;
 								}
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								struct packet_batch_per_flow {
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								    unsigned int byte_count;
 								    uint16_t tcp_flags;
 								    struct dp_netdev_flow *flow;
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    struct dp_packet_batch array;
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								};
 								static inline void
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
 								                             struct dp_packet *packet,
 								                             const struct miniflow *mf)
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								{
-												dp-packet: Remove ofpbuf dependency.

Currently dp-packet make use of ofpbuf for managing packet
buffers. That complicates ofpbuf, by making dp-packet
independent of ofpbuf both libraries can be optimized for
their own use case.
This avoids mapping operation between ofpbuf and dp_packet
in datapath upcalls.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-02-22 03:21:09 -08:00
+								    batch->byte_count += dp_packet_size(packet);
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    batch->tcp_flags |= miniflow_get_tcp_flags(mf);
 								    batch->array.packets[batch->array.count++] = packet;
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								}
 								static inline void
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
 								                           struct dp_netdev_flow *flow)
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								{
-												dpif-netdev: Store batch pointer in dp_netdev_flow.

The userspace datapath

1. receives a batch of packets.
2. finds a 'netdev_flow' (megaflow) for each packet.
3. groups the packets in output batches based on the 'netdev_flow'.

Until now the grouping (2) was done using a simple algorithm with a
O(N^2) runtime, where N is the number of distinct megaflows of the packets
in the incoming batch.  This could quickly become a bottleneck, even with
a small number of megaflows.

With this commit the datapath simply stores in the 'netdev_flow' (the
megaflow) a pointer to the output batch, if one has been created for the
current input batch.  The pointer will be cleared when the output batch
is sent.

In a simple phy2phy test with 128 megaflows the throughput is more than
doubled.

The reason that stopped us from doing this change was that the
'netdev_flow' memory was shared between multiple threads: this is no
longer the case with the per-thread classifier.

Also, this commit reorders struct dp_netdev_flow to group toghether the
members used in the fastpath.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:50 -07:00
+								    flow->batch = batch;
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
-												dpif-netdev: Store batch pointer in dp_netdev_flow.

The userspace datapath

1. receives a batch of packets.
2. finds a 'netdev_flow' (megaflow) for each packet.
3. groups the packets in output batches based on the 'netdev_flow'.

Until now the grouping (2) was done using a simple algorithm with a
O(N^2) runtime, where N is the number of distinct megaflows of the packets
in the incoming batch.  This could quickly become a bottleneck, even with
a small number of megaflows.

With this commit the datapath simply stores in the 'netdev_flow' (the
megaflow) a pointer to the output batch, if one has been created for the
current input batch.  The pointer will be cleared when the output batch
is sent.

In a simple phy2phy test with 128 megaflows the throughput is more than
doubled.

The reason that stopped us from doing this change was that the
'netdev_flow' memory was shared between multiple threads: this is no
longer the case with the per-thread classifier.

Also, this commit reorders struct dp_netdev_flow to group toghether the
members used in the fastpath.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:50 -07:00
+								    batch->flow = flow;
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    dp_packet_batch_init(&batch->array);
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								    batch->byte_count = 0;
 								    batch->tcp_flags = 0;
 								}
 								static inline void
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
 								                              struct dp_netdev_pmd_thread *pmd,
 								                              long long now)
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								{
 								    struct dp_netdev_actions *actions;
 								    struct dp_netdev_flow *flow = batch->flow;
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
-												dpif-netdev: Cache time_msec() calls for each received batch.

Calling time_msec() (which calls clock_gettime()) too often might be
expensive.  With this commit OVS makes only one call per received
batch and caches the result.

Suggested-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:47 +01:00
+								                        batch->tcp_flags, now);
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
 								    actions = dp_netdev_flow_get_actions(flow);
-												conntrack: Add 'dl_type' parameter to conntrack_execute().

Now that dpif_execute has a 'flow' member, it's pretty easy to access a
the flow (or the matching megaflow) in dp_execute_cb().

This means that's not necessary anymore for the connection tracker to
reextract 'dl_type' from the packet, it can be passed as a parameter.

This change means that we have to complicate sightly test-conntrack to
group the packets by dl_type before passing them to the connection
tracker.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-25 18:10:09 -07:00
+								    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                              actions->actions, actions->size, now);
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								}
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								static inline void
-												dpif_packet: Rename to dp_packet

dp_packet is short and better name for datapath packet
structure.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2015-02-25 12:01:53 -08:00
+								dp_netdev_queue_batches(struct dp_packet *pkt,
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								                        struct dp_netdev_flow *flow, const struct miniflow *mf,
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								                        struct packet_batch_per_flow *batches, size_t *n_batches)
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								{
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								    struct packet_batch_per_flow *batch = flow->batch;
-												dpif-netdev: Store batch pointer in dp_netdev_flow.

The userspace datapath

1. receives a batch of packets.
2. finds a 'netdev_flow' (megaflow) for each packet.
3. groups the packets in output batches based on the 'netdev_flow'.

Until now the grouping (2) was done using a simple algorithm with a
O(N^2) runtime, where N is the number of distinct megaflows of the packets
in the incoming batch.  This could quickly become a bottleneck, even with
a small number of megaflows.

With this commit the datapath simply stores in the 'netdev_flow' (the
megaflow) a pointer to the output batch, if one has been created for the
current input batch.  The pointer will be cleared when the output batch
is sent.

In a simple phy2phy test with 128 megaflows the throughput is more than
doubled.

The reason that stopped us from doing this change was that the
'netdev_flow' memory was shared between multiple threads: this is no
longer the case with the per-thread classifier.

Also, this commit reorders struct dp_netdev_flow to group toghether the
members used in the fastpath.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:50 -07:00
-												dpif-netdev: Reduce code duplication

Code clean up to reduce code duplication.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-25 18:51:11 -08:00
+								    if (OVS_UNLIKELY(!batch)) {
 								        batch = &batches[(*n_batches)++];
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								        packet_batch_per_flow_init(batch, flow);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    }
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								    packet_batch_per_flow_update(batch, pkt, mf);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								}
 								/* Try to process all ('cnt') the 'packets' using only the exact match cache
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								 * miniflow is copied into 'keys' and the packet pointer is moved at the
 								 * beginning of the 'packets' array.
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								 *
 								 * The function returns the number of packets that needs to be processed in the
 								 * 'packets' array (they have been moved to the beginning of the vector).
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								 *
 								 * If 'md_is_valid' is false, the metadata in 'packets' is not valid and must be
 								 * initialized by this function using 'port_no'.
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								 */
 								static inline size_t
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets_,
 								               struct netdev_flow_key *keys,
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								               struct packet_batch_per_flow batches[], size_t *n_batches,
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								               bool md_is_valid, odp_port_t port_no)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    struct emc_cache *flow_cache = &pmd->flow_cache;
-												dpif-netdev: optmizing emc_processing()

Commit d262ac2c60ce1da7b477737f70e8efd38b32502d introduced a slight
performance drop for the fast path, where every packets hits the
emc cache.  This patch removes that performance drop by only reloading
the key pointer on emc cache miss.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-29 17:48:50 -08:00
+								    struct netdev_flow_key *key = &keys[0];
-												dpif-netdev: properly maintain exact match cache hit counter

Current logic counts dropped packet as cache hit which is not
correct. This patch removes dropped packet to improve accuracy.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-25 18:43:47 -08:00
+								    size_t i, n_missed = 0, n_dropped = 0;
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    struct dp_packet **packets = packets_->packets;
 								    int cnt = packets_->count;
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
-												dpif-netdev: Batch megaflow lookup.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-06-23 18:28:43 -07:00
+								    for (i = 0; i < cnt; i++) {
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								        struct dp_netdev_flow *flow;
-												dpif-netdev: Load packet pointer only once in emc_processing()

For the machines I have access to, Reloading the same pointer from
memory seems to inhibit complier optimization somewhat.

In emc_processing(), using a single packet pointer, instead reloading
it from memory with packets[i], improves performance by 0.3 Mpps (tested
with 10G NIC pushing 64 byte packets, with the base line of 12.2 Mpps).

Besides improving performance, this patch should also improves code
readability.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-29 17:40:18 -08:00
+								        struct dp_packet *packet = packets[i];
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
-												dpif-netdev: Load packet pointer only once in emc_processing()

For the machines I have access to, Reloading the same pointer from
memory seems to inhibit complier optimization somewhat.

In emc_processing(), using a single packet pointer, instead reloading
it from memory with packets[i], improves performance by 0.3 Mpps (tested
with 10G NIC pushing 64 byte packets, with the base line of 12.2 Mpps).

Besides improving performance, this patch should also improves code
readability.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-29 17:40:18 -08:00
+								        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
 								            dp_packet_delete(packet);
-												dpif-netdev: properly maintain exact match cache hit counter

Current logic counts dropped packet as cache hit which is not
correct. This patch removes dropped packet to improve accuracy.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-25 18:43:47 -08:00
+								            n_dropped++;
-												dpif-netdev: Batch megaflow lookup.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-06-23 18:28:43 -07:00
+								            continue;
 								        }
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
-												dpif-netdev: Prefetch next packet before miniflow_extract().

It appears that miniflow_extract() in emc_processing() spends a lot of
cycles waiting for the packet's data to be read.

Prefetching the next packet's data while parsing removes this delay.
For a single flow pipeline the throughput improves by ~10%.  With a
more realistic pipeline the change has a much smaller effect (~0.5%
improvement)

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-06-15 19:06:39 +01:00
+								        if (i != cnt - 1) {
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								            /* Prefetch next packet data and metadata. */
-												dpif-netdev: Prefetch next packet before miniflow_extract().

It appears that miniflow_extract() in emc_processing() spends a lot of
cycles waiting for the packet's data to be read.

Prefetching the next packet's data while parsing removes this delay.
For a single flow pipeline the throughput improves by ~10%.  With a
more realistic pipeline the change has a much smaller effect (~0.5%
improvement)

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-06-15 19:06:39 +01:00
+								            OVS_PREFETCH(dp_packet_data(packets[i+1]));
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								            pkt_metadata_prefetch_init(&packets[i+1]->md);
-												dpif-netdev: Prefetch next packet before miniflow_extract().

It appears that miniflow_extract() in emc_processing() spends a lot of
cycles waiting for the packet's data to be read.

Prefetching the next packet's data while parsing removes this delay.
For a single flow pipeline the throughput improves by ~10%.  With a
more realistic pipeline the change has a much smaller effect (~0.5%
improvement)

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-06-15 19:06:39 +01:00
+								        }
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								        if (!md_is_valid) {
 								            pkt_metadata_init(&packet->md, port_no);
 								        }
-												dpif-netdev: Load packet pointer only once in emc_processing()

For the machines I have access to, Reloading the same pointer from
memory seems to inhibit complier optimization somewhat.

In emc_processing(), using a single packet pointer, instead reloading
it from memory with packets[i], improves performance by 0.3 Mpps (tested
with 10G NIC pushing 64 byte packets, with the base line of 12.2 Mpps).

Besides improving performance, this patch should also improves code
readability.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-29 17:40:18 -08:00
+								        miniflow_extract(packet, &key->mf);
-												dpif-netdev: Avoid copying netdev_flow_key in emc_processing().

Before this commit, emc_processing() copied a netdev_flow_key if there was
no exact-match cache (EMC) hit.  This commit eliminates the copy by
constructing the netdev_flow_key in the place it would be copied.

Found by inspection.

Shahbaz (CCed) reports that this reduces the cost of an EMC miss by 72
cycles in his test case in which the EMC is disabled.  Presumably this
is similarly valuable in cases where the EMC merely has few hits.

For the original version of this patch, which was against a slightly
earlier version of OVS, Daniele reported that:

    - With EMC disabled, this increases throughput from 4.8 Mpps to 5.4
      Mpps.

    - With EMC enabled, this decreases throughput from 12.4 to 12.0 Mpps.

CC: Muhammad Shahbaz <mshahbaz@cs.princeton.edu>
Signed-off-by: Ben Pfaff <blp@ovn.org>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-24 08:32:36 -08:00
+								        key->len = 0; /* Not computed yet. */
-												dpif-netdev: Load packet pointer only once in emc_processing()

For the machines I have access to, Reloading the same pointer from
memory seems to inhibit complier optimization somewhat.

In emc_processing(), using a single packet pointer, instead reloading
it from memory with packets[i], improves performance by 0.3 Mpps (tested
with 10G NIC pushing 64 byte packets, with the base line of 12.2 Mpps).

Besides improving performance, this patch should also improves code
readability.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-29 17:40:18 -08:00
+								        key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
-												dpif-netdev: Avoid copying netdev_flow_key in emc_processing().

Before this commit, emc_processing() copied a netdev_flow_key if there was
no exact-match cache (EMC) hit.  This commit eliminates the copy by
constructing the netdev_flow_key in the place it would be copied.

Found by inspection.

Shahbaz (CCed) reports that this reduces the cost of an EMC miss by 72
cycles in his test case in which the EMC is disabled.  Presumably this
is similarly valuable in cases where the EMC merely has few hits.

For the original version of this patch, which was against a slightly
earlier version of OVS, Daniele reported that:

    - With EMC disabled, this increases throughput from 4.8 Mpps to 5.4
      Mpps.

    - With EMC enabled, this decreases throughput from 12.4 to 12.0 Mpps.

CC: Muhammad Shahbaz <mshahbaz@cs.princeton.edu>
Signed-off-by: Ben Pfaff <blp@ovn.org>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-24 08:32:36 -08:00
+								        flow = emc_lookup(flow_cache, key);
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								        if (OVS_LIKELY(flow)) {
-												dpif-netdev: Load packet pointer only once in emc_processing()

For the machines I have access to, Reloading the same pointer from
memory seems to inhibit complier optimization somewhat.

In emc_processing(), using a single packet pointer, instead reloading
it from memory with packets[i], improves performance by 0.3 Mpps (tested
with 10G NIC pushing 64 byte packets, with the base line of 12.2 Mpps).

Besides improving performance, this patch should also improves code
readability.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-29 17:40:18 -08:00
+								            dp_netdev_queue_batches(packet, flow, &key->mf, batches,
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								                                    n_batches);
 								        } else {
-												dpif-netdev: drop swapping

emc_processing() moves all the missed packets towards the beginning of
packet array; matched packets are queued up into flow queues. Since the
remaining of the packet array is not used anymore, don't bother swap
packet pointers to save cycles and simplify logic.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-25 18:48:19 -08:00
+								            /* Exact match cache missed. Group missed packets together at
 								             * the beginning of the 'packets' array.  */
-												dpif-netdev: optmizing emc_processing()

Commit d262ac2c60ce1da7b477737f70e8efd38b32502d introduced a slight
performance drop for the fast path, where every packets hits the
emc cache.  This patch removes that performance drop by only reloading
the key pointer on emc cache miss.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-29 17:48:50 -08:00
+								            packets[n_missed] = packet;
-												dpif-netdev: Correctly update 'key' in emc_processing().

The 'key' pointer must point at the first unused element in the key
array.

Fixes: b89c678b7a26 ("dpif-netdev: optmizing emc_processing()")
CC: Andy Zhou <azhou@ovn.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@nicira.com>

											
										
										
											2016-02-02 20:38:11 -08:00
+								            /* 'key[n_missed]' contains the key of the current packet and it
 								             * must be returned to the caller. The next key should be extracted
 								             * to 'keys[n_missed + 1]'. */
 								            key = &keys[++n_missed];
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								        }
 								    }
-												dpif-netdev: properly maintain exact match cache hit counter

Current logic counts dropped packet as cache hit which is not
correct. This patch removes dropped packet to improve accuracy.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-25 18:43:47 -08:00
+								    dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - n_dropped - n_missed);
-												dpif-netdev: Use miniflow as a flow key.

Use miniflow as a flow key in the userspace datapath classifier.  The
miniflow is expanded for upcalls, but for existing datapath flows, the
key need not be expanded.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Reviewed-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
											
										
										
											2014-04-18 08:26:57 -07:00
-												dpif-netdev: properly maintain exact match cache hit counter

Current logic counts dropped packet as cache hit which is not
correct. This patch removes dropped packet to improve accuracy.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-25 18:43:47 -08:00
+								    return n_missed;
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								}
-												dpif-netdev: Refactor fast path process function.

Once datapath support large packets, we need to segment packet before
sending it to upcall. Refactoring this code make it bit cleaner.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:32 -07:00
+								static inline void
 								handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet,
 								                     const struct netdev_flow_key *key,
 								                     struct ofpbuf *actions, struct ofpbuf *put_actions,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                     int *lost_cnt, long long now)
-												dpif-netdev: Refactor fast path process function.

Once datapath support large packets, we need to segment packet before
sending it to upcall. Refactoring this code make it bit cleaner.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:32 -07:00
+								{
 								    struct ofpbuf *add_actions;
 								    struct dp_packet_batch b;
 								    struct match match;
 								    ovs_u128 ufid;
 								    int error;
 								    match.tun_md.valid = false;
 								    miniflow_expand(&key->mf, &match.flow);
 								    ofpbuf_clear(actions);
 								    ofpbuf_clear(put_actions);
 								    dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
 								    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
 								                             &ufid, DPIF_UC_MISS, NULL, actions,
 								                             put_actions);
 								    if (OVS_UNLIKELY(error && error != ENOSPC)) {
 								        dp_packet_delete(packet);
 								        (*lost_cnt)++;
 								        return;
 								    }
 								    /* The Netlink encoding of datapath flow keys cannot express
 								     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
 								     * tag is interpreted as exact match on the fact that there is no
 								     * VLAN.  Unless we refactor a lot of code that translates between
 								     * Netlink and struct flow representations, we have to do the same
 								     * here. */
 								    if (!match.wc.masks.vlan_tci) {
 								        match.wc.masks.vlan_tci = htons(0xffff);
 								    }
 								    /* We can't allow the packet batching in the next loop to execute
 								     * the actions.  Otherwise, if there are any slow path actions,
 								     * we'll send the packet up twice. */
 								    packet_batch_init_packet(&b, packet);
-												conntrack: Add 'dl_type' parameter to conntrack_execute().

Now that dpif_execute has a 'flow' member, it's pretty easy to access a
the flow (or the matching megaflow) in dp_execute_cb().

This means that's not necessary anymore for the connection tracker to
reextract 'dl_type' from the packet, it can be passed as a parameter.

This change means that we have to complicate sightly test-conntrack to
group the packets by dl_type before passing them to the connection
tracker.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-25 18:10:09 -07:00
+								    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                              actions->data, actions->size, now);
-												dpif-netdev: Refactor fast path process function.

Once datapath support large packets, we need to segment packet before
sending it to upcall. Refactoring this code make it bit cleaner.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:32 -07:00
 								    add_actions = put_actions->size ? put_actions : actions;
 								    if (OVS_LIKELY(error != ENOSPC)) {
 								        struct dp_netdev_flow *netdev_flow;
 								        /* XXX: There's a race window where a flow covering this packet
 								         * could have already been installed since we last did the flow
 								         * lookup before upcall.  This could be solved by moving the
 								         * mutex lock outside the loop, but that's an awful long time
 								         * to be locking everyone out of making flow installs.  If we
 								         * move to a per-core classifier, it would be reasonable. */
 								        ovs_mutex_lock(&pmd->flow_mutex);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
-												dpif-netdev: Refactor fast path process function.

Once datapath support large packets, we need to segment packet before
sending it to upcall. Refactoring this code make it bit cleaner.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:32 -07:00
+								        if (OVS_LIKELY(!netdev_flow)) {
 								            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
 								                                             add_actions->data,
 								                                             add_actions->size);
 								        }
 								        ovs_mutex_unlock(&pmd->flow_mutex);
 								        emc_insert(&pmd->flow_cache, key, netdev_flow);
 								    }
 								}
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								static inline void
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								fast_path_processing(struct dp_netdev_pmd_thread *pmd,
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                     struct dp_packet_batch *packets_,
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								                     struct netdev_flow_key *keys,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                     struct packet_batch_per_flow batches[], size_t *n_batches,
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								                     odp_port_t in_port,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                     long long now)
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								{
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    int cnt = packets_->count;
-												dpif-netdev: Avoid variable length array on MSVC.

MSVC does not like variable length array either.

This patch treats the following error:

lib/dpif-netdev.c(2272) : error C2057: expected constant expression
lib/dpif-netdev.c(2272) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2272) : error C2133: 'batches' : unknown size
lib/dpif-netdev.c(2273) : error C2057: expected constant expression
lib/dpif-netdev.c(2273) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2273) : error C2133: 'mfs' : unknown size
lib/dpif-netdev.c(2274) : error C2057: expected constant expression
lib/dpif-netdev.c(2274) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2274) : error C2133: 'rules' : unknown size
lib/dpif-netdev.c(2363) : warning C4034: sizeof returns 0
lib/dpif-netdev.c(2381) : error C2057: expected constant expression
lib/dpif-netdev.c(2381) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2381) : error C2133: 'keys' : unknown size
make[2]: *** [lib/dpif-netdev.lo] Error 1

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-01 20:11:54 +00:00
+								#if !defined(__CHECKER__) && !defined(_WIN32)
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    const size_t PKT_ARRAY_SIZE = cnt;
 								#else
-												dpif-netdev: Avoid variable length array on MSVC.

MSVC does not like variable length array either.

This patch treats the following error:

lib/dpif-netdev.c(2272) : error C2057: expected constant expression
lib/dpif-netdev.c(2272) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2272) : error C2133: 'batches' : unknown size
lib/dpif-netdev.c(2273) : error C2057: expected constant expression
lib/dpif-netdev.c(2273) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2273) : error C2133: 'mfs' : unknown size
lib/dpif-netdev.c(2274) : error C2057: expected constant expression
lib/dpif-netdev.c(2274) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2274) : error C2133: 'rules' : unknown size
lib/dpif-netdev.c(2363) : warning C4034: sizeof returns 0
lib/dpif-netdev.c(2381) : error C2057: expected constant expression
lib/dpif-netdev.c(2381) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2381) : error C2133: 'keys' : unknown size
make[2]: *** [lib/dpif-netdev.lo] Error 1

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-01 20:11:54 +00:00
+								    /* Sparse or MSVC doesn't like variable length array. */
-												dpdk: Ditch MAX_PKT_BURST macro.

The MAX_PKT_BURST and NETDEV_MAX_RX_BATCH macros had a confusing
relationship.  They basically purport to do the same thing, making it
unclear which is the source of truth.

Furthermore, while NETDEV_MAX_RX_BATCH was 256, MAX_PKT_BURST was 32,
meaning we never process a batch larger than 32 packets further adding
to the confusion.

This patch resolves the issue by removing MAX_PKT_BURST completely,
and shrinking the new NETDEV_MAX_BURST macro to only 32.  This should
have no change in the execution path except shrinking a couple of
structs and memory allocations (can't hurt).

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2015-05-16 08:18:20 -07:00
+								    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								#endif
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    struct dp_packet **packets = packets_->packets;
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    struct dpcls *cls;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    struct dpcls_rule *rules[PKT_ARRAY_SIZE];
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    struct dp_netdev *dp = pmd->dp;
 								    struct emc_cache *flow_cache = &pmd->flow_cache;
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								    int miss_cnt = 0, lost_cnt = 0;
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    int lookup_cnt = 0, add_lookup_cnt;
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    bool any_miss;
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								    size_t i;
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
 								    for (i = 0; i < cnt; i++) {
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								        /* Key length is needed in all the cases, hash computed on demand. */
-												flow: Split miniflow's map.

Use two maps in miniflow to allow for expansion of struct flow past
512 bytes.  We now have one map for tunnel related fields, and another
for the rest of the packet metadata and actual packet header fields.
This split has the benefit that for non-tunneled packets the overhead
should be minimal.

Some miniflow utilities now exist in two variants, new ones operating
over all the data, and the old ones operating only on a single 64-bit
map at a time.  The old ones require doubling of code but should
execute faster, so those are used in the datapath and classifier's
lookup path.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-17 15:18:43 -07:00
+								        keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    }
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    /* Get the classifier for the in_port */
 								    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
 								    if (OVS_LIKELY(cls)) {
 								        any_miss = !dpcls_lookup(cls, keys, rules, cnt, &lookup_cnt);
 								    } else {
 								        any_miss = true;
 								        memset(rules, 0, sizeof(rules));
 								    }
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
 								        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
 								        struct ofpbuf actions, put_actions;
 								        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
 								        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
 								        for (i = 0; i < cnt; i++) {
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								            struct dp_netdev_flow *netdev_flow;
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								            if (OVS_LIKELY(rules[i])) {
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								                continue;
 								            }
 								            /* It's possible that an earlier slow path execution installed
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								             * a rule covering this flow.  In this case, it's a lot cheaper
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								             * to catch it here than execute a miss. */
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i],
 								                                                    &add_lookup_cnt);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								            if (netdev_flow) {
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								                lookup_cnt += add_lookup_cnt;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								                rules[i] = &netdev_flow->cr;
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								                continue;
 								            }
-												dpif-netdev: Group statistics updates in the slow path.

Since statistics updates might require locking (in future commits)
grouping them will reduce the locking overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:13 +01:00
+								            miss_cnt++;
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								            handle_packet_upcall(pmd, packets[i], &keys[i], &actions,
 								                                 &put_actions, &lost_cnt, now);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								        }
 								        ofpbuf_uninit(&actions);
 								        ofpbuf_uninit(&put_actions);
 								        fat_rwlock_unlock(&dp->upcall_rwlock);
-												dpif-netdev: Group statistics updates in the slow path.

Since statistics updates might require locking (in future commits)
grouping them will reduce the locking overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-07 20:02:13 +01:00
+								        dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
-												dpif-netdev: Fix (packet) memory leaks in the slow path.

If a packet didn't match a rule in the fast path classifier its memory was
never freed. The issue was particularly clear with DPDK devices because it was
not possible to process more than ~250000 DPDK mbufs in the slow path.

This commit fixes the problem by:
* calling dpif_packet_delete() if the upcalls are disabled
* passing may_steal==true to dp_netdev_execute_actions() during normal upcall
  processing

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-19 16:20:01 -07:00
+								    } else if (OVS_UNLIKELY(any_miss)) {
 								        for (i = 0; i < cnt; i++) {
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								            if (OVS_UNLIKELY(!rules[i])) {
-												dpif_packet: Rename to dp_packet

dp_packet is short and better name for datapath packet
structure.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2015-02-25 12:01:53 -08:00
+								                dp_packet_delete(packets[i]);
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								                lost_cnt++;
 								                miss_cnt++;
-												dpif-netdev: Fix (packet) memory leaks in the slow path.

If a packet didn't match a rule in the fast path classifier its memory was
never freed. The issue was particularly clear with DPDK devices because it was
not possible to process more than ~250000 DPDK mbufs in the slow path.

This commit fixes the problem by:
* calling dpif_packet_delete() if the upcalls are disabled
* passing may_steal==true to dp_netdev_execute_actions() during normal upcall
  processing

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-19 16:20:01 -07:00
+								            }
 								        }
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								    }
-												dpif-netdev: Batch megaflow lookup.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-06-23 18:28:43 -07:00
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								    for (i = 0; i < cnt; i++) {
-												dpif_packet: Rename to dp_packet

dp_packet is short and better name for datapath packet
structure.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2015-02-25 12:01:53 -08:00
+								        struct dp_packet *packet = packets[i];
-												dpif-netdev: Batch megaflow lookup.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-06-23 18:28:43 -07:00
+								        struct dp_netdev_flow *flow;
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								        if (OVS_UNLIKELY(!rules[i])) {
-												dpif-netdev: Batch megaflow lookup.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-06-23 18:28:43 -07:00
+								            continue;
 								        }
 								        flow = dp_netdev_flow_cast(rules[i]);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
 								        emc_insert(flow_cache, &keys[i], flow);
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								        dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								    }
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								    dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    dp_netdev_count_packet(pmd, DP_STAT_LOOKUP_HIT, lookup_cnt);
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								    dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
 								    dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								/* Packets enter the datapath from a port (or from recirculation) here.
 								 *
 								 * For performance reasons a caller may choose not to initialize the metadata
 								 * in 'packets': in this case 'mdinit' is false and this function needs to
 								 * initialize it using 'port_no'.  If the metadata in 'packets' is already
 								 * valid, 'md_is_valid' must be true and 'port_no' will be ignored. */
-												ofproto/bond: Implement bond megaflow using recirculation

Infrastructure to enable megaflow support for bond ports using
recirculation. This patch adds the following features:
* Generate RECIRC action when bond can benefit from recirculation.
* Populate post recirculation rules in a hidden table. Currently table 254.
* Uses post recirculation rules for bond rebalancing
* A recirculation implementation in dpif-netdev.

The goal of this patch is to be able to megaflow bond outputs and
thus greatly improve performance. However, this patch does not
actually improve the megaflow generation. It is left for a later commit.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-05 15:27:31 -08:00
+								static void
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                  struct dp_packet_batch *packets,
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								                  bool md_is_valid, odp_port_t port_no)
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								{
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    int cnt = packets->count;
-												dpif-netdev: Avoid variable length array on MSVC.

MSVC does not like variable length array either.

This patch treats the following error:

lib/dpif-netdev.c(2272) : error C2057: expected constant expression
lib/dpif-netdev.c(2272) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2272) : error C2133: 'batches' : unknown size
lib/dpif-netdev.c(2273) : error C2057: expected constant expression
lib/dpif-netdev.c(2273) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2273) : error C2133: 'mfs' : unknown size
lib/dpif-netdev.c(2274) : error C2057: expected constant expression
lib/dpif-netdev.c(2274) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2274) : error C2133: 'rules' : unknown size
lib/dpif-netdev.c(2363) : warning C4034: sizeof returns 0
lib/dpif-netdev.c(2381) : error C2057: expected constant expression
lib/dpif-netdev.c(2381) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2381) : error C2133: 'keys' : unknown size
make[2]: *** [lib/dpif-netdev.lo] Error 1

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-01 20:11:54 +00:00
+								#if !defined(__CHECKER__) && !defined(_WIN32)
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    const size_t PKT_ARRAY_SIZE = cnt;
 								#else
-												dpif-netdev: Avoid variable length array on MSVC.

MSVC does not like variable length array either.

This patch treats the following error:

lib/dpif-netdev.c(2272) : error C2057: expected constant expression
lib/dpif-netdev.c(2272) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2272) : error C2133: 'batches' : unknown size
lib/dpif-netdev.c(2273) : error C2057: expected constant expression
lib/dpif-netdev.c(2273) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2273) : error C2133: 'mfs' : unknown size
lib/dpif-netdev.c(2274) : error C2057: expected constant expression
lib/dpif-netdev.c(2274) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2274) : error C2133: 'rules' : unknown size
lib/dpif-netdev.c(2363) : warning C4034: sizeof returns 0
lib/dpif-netdev.c(2381) : error C2057: expected constant expression
lib/dpif-netdev.c(2381) : error C2466: cannot allocate an array of constant size 0
lib/dpif-netdev.c(2381) : error C2133: 'keys' : unknown size
make[2]: *** [lib/dpif-netdev.lo] Error 1

Signed-off-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-01 20:11:54 +00:00
+								    /* Sparse or MSVC doesn't like variable length array. */
-												dpdk: Ditch MAX_PKT_BURST macro.

The MAX_PKT_BURST and NETDEV_MAX_RX_BATCH macros had a confusing
relationship.  They basically purport to do the same thing, making it
unclear which is the source of truth.

Furthermore, while NETDEV_MAX_RX_BATCH was 256, MAX_PKT_BURST was 32,
meaning we never process a batch larger than 32 packets further adding
to the confusion.

This patch resolves the issue by removing MAX_PKT_BURST completely,
and shrinking the new NETDEV_MAX_BURST macro to only 32.  This should
have no change in the execution path except shrinking a couple of
structs and memory allocations (can't hurt).

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2015-05-16 08:18:20 -07:00
+								    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								#endif
-												dpif-netdev: Fix windows build.

OVS_ALIGNED_VAR(...) should be at the beginning of a definition, as
the example in include/openvswitch/compiler.h shows.

Fixes: 38ee0814978c ("dpif-netdev: Cache align netdev_flow_keys.")
Reported-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Acked-by: Sairam Venugopal <vsairam@vmware.com>

											
										
										
											2016-10-19 11:31:33 -07:00
+								    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct netdev_flow_key keys[PKT_ARRAY_SIZE];
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
-												dpif-netdev: Cache time_msec() calls for each received batch.

Calling time_msec() (which calls clock_gettime()) too often might be
expensive.  With this commit OVS makes only one call per received
batch and caches the result.

Suggested-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:47 +01:00
+								    long long now = time_msec();
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								    size_t newcnt, n_batches, i;
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    odp_port_t in_port;
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								    n_batches = 0;
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    newcnt = emc_processing(pmd, packets, keys, batches, &n_batches,
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								                            md_is_valid, port_no);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    if (OVS_UNLIKELY(newcnt)) {
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								        packets->count = newcnt;
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								        /* Get ingress port from first packet's metadata. */
 								        in_port = packets->packets[0]->md.in_port.odp_port;
 								        fast_path_processing(pmd, packets, keys, batches, &n_batches, in_port, now);
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								    }
-												dpif-netdev: Add comments to dp_netdev_input__().

Add comments in dp_netdev_input__() to explain the reason behind
clearing the flow batches before packet_batch_execute().

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-14 15:37:08 +01:00
+								    /* All the flow batches need to be reset before any call to
 								     * packet_batch_per_flow_execute() as it could potentially trigger
 								     * recirculation. When a packet matching flow ‘j’ happens to be
 								     * recirculated, the nested call to dp_netdev_input__() could potentially
 								     * classify the packet as matching another flow - say 'k'. It could happen
 								     * that in the previous call to dp_netdev_input__() that same flow 'k' had
 								     * already its own batches[k] still waiting to be served.  So if its
 								     * ‘batch’ member is not reset, the recirculated packet would be wrongly
 								     * appended to batches[k] of the 1st call to dp_netdev_input__(). */
-												dpif-netdev: Clear flow batches before execute.

When executing actions, it's possible a recirculation will occur
causing dp_netdev_input() to be called multiple times.  If the batch
pointers embedded in dp_netdev_flow aren't cleared, it's possible
packets after the recirculation will be reinserted into a batch
associated with the original lookup.  This could be very bad.

This patch fixes the problem by zeroing out flow batch pointers before
calling packet_batch_execute().  This probably has a slightly negative
performance impact, though I haven't tried it.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2015-05-20 16:55:17 -07:00
+								    for (i = 0; i < n_batches; i++) {
 								        batches[i].flow->batch = NULL;
 								    }
-												dpif-netdev: Share emc and fast path output batches.

Until now the exact match cache processing was able to handle only four
megaflows.  The rest of the packets was passed to the megaflow
classifier.

The limit was arbitraly set to four also because the algorithm used to
group packets in output batches didn't perform well with a lot of
megaflows.

After changing the algorithm and after some performance testing it seems
much better just to share the same output batches between the exact
match cache and the megaflow classifier.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2015-05-18 10:47:51 -07:00
+								    for (i = 0; i < n_batches; i++) {
-												dpif-netdev: rename packet_batch

Next patch introduces new structure named packet_batch. So
I am renaming it to packet_batch_per_flow.
This does not change any functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:28 -07:00
+								        packet_batch_per_flow_execute(&batches[i], pmd, now);
-												dpif-netdev: Exact match cache

Since lookups in the classifier can be pretty expensive,
we introduce this (thread local) cache which simply
compares the miniflows of the packets

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-08-29 16:06:43 -07:00
+								    }
 								}
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								static void
 								dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                struct dp_packet_batch *packets,
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								                odp_port_t port_no)
 								{
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    dp_netdev_input__(pmd, packets, false, port_no);
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								}
 								static void
 								dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                      struct dp_packet_batch *packets)
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								{
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    dp_netdev_input__(pmd, packets, true, 0);
-												dpif-netdev: Delay packets' metadata initialization.

When a group of packets arrives from a port, we loop through them to
initialize metadata and then we loop through them again to extract the
flow and perform the exact match classification.

This commit combines the two loops into one, and initializes packet->md
in emc_processing() to improve performance.

Since emc_processing() might also be called after recirculation (in
which case the metadata is already valid), an extra parameter is added
to support both cases.

This commits also implements simple prefetching of packet metadata,
to further improve performance.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Andy Zhou <azhou@ovn.org>
Acked-by: Chandran, Sugesh <sugesh.chandran@intel.com>

											
										
										
											2016-01-28 17:47:51 -08:00
+								}
-												dpif-netdev: Maintain the original key during execution.

Userspace action needs the original flow key.  This also
matches the kernel datapath behavior.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-14 14:35:58 -08:00
+								struct dp_netdev_execute_aux {
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								    struct dp_netdev_pmd_thread *pmd;
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    long long now;
-												conntrack: Add 'dl_type' parameter to conntrack_execute().

Now that dpif_execute has a 'flow' member, it's pretty easy to access a
the flow (or the matching megaflow) in dp_execute_cb().

This means that's not necessary anymore for the connection tracker to
reextract 'dl_type' from the packet, it can be passed as a parameter.

This change means that we have to complicate sightly test-conntrack to
group the packets by dl_type before passing them to the connection
tracker.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-25 18:10:09 -07:00
+								    const struct flow *flow;
-												dpif-netdev: Maintain the original key during execution.

Userspace action needs the original flow key.  This also
matches the kernel datapath behavior.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-14 14:35:58 -08:00
+								};
-												dpif-netdev: Purge all ukeys when reconfigure pmd.

When dpdk configuration changes, all pmd threads are recreated
and rx queues of each port are reloaded.  After this process,
rx queue could be mapped to a different pmd thread other than
the one before reconfiguration.  However, this is totally
transparent to ofproto layer modules.  So, if the ofproto-dpif-upcall
module still holds ukeys generated before pmd thread recreation,
this old ukey will collide with the ukey for the new upcalls
from same traffic flow, causing flow installation failure.

To fix the bug, this commit adds a new call-back function
in dpif layer for notifying upper layer the purging of datapath
(e.g. pmd thread deletion in dpif-netdev).  So, the
ofproto-dpif-upcall module can react properly with deleting
the ukeys and with collecting flows' last stats.

Reported-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Alex Wang <ee07b291@gmail.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joestringer@nicira.com>

											
										
										
											2015-08-25 16:36:46 -07:00
+								static void
 								dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
 								                                 void *aux)
 								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
 								    dp->dp_purge_aux = aux;
 								    dp->dp_purge_cb = cb;
 								}
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								static void
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
 								                               void *aux)
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								    dp->upcall_aux = aux;
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								    dp->upcall_cb = cb;
 								}
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								static void
 								dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
 								                               long long now, bool purge)
 								{
 								    struct tx_port *tx;
 								    struct dp_netdev_port *port;
 								    long long interval;
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								    HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
-												dpif-netdev: Fix xps revalidation.

Revalidation should work in case of 'dynamic_txqs == true'.

Fixes: 324c8374852a ("dpif-netdev: XPS (Transmit Packet Steering) implementation.")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-29 11:07:21 +03:00
+								        if (!tx->port->dynamic_txqs) {
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								            continue;
 								        }
 								        interval = now - tx->last_used;
 								        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
 								            port = tx->port;
 								            ovs_mutex_lock(&port->txq_used_mutex);
 								            port->txq_used[tx->qid]--;
 								            ovs_mutex_unlock(&port->txq_used_mutex);
 								            tx->qid = -1;
 								        }
 								    }
 								}
 								static int
 								dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
 								                           struct tx_port *tx, long long now)
 								{
 								    struct dp_netdev_port *port;
 								    long long interval;
 								    int i, min_cnt, min_qid;
 								    if (OVS_UNLIKELY(!now)) {
 								        now = time_msec();
 								    }
 								    interval = now - tx->last_used;
 								    tx->last_used = now;
 								    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
 								        return tx->qid;
 								    }
 								    port = tx->port;
 								    ovs_mutex_lock(&port->txq_used_mutex);
 								    if (tx->qid >= 0) {
 								        port->txq_used[tx->qid]--;
 								        tx->qid = -1;
 								    }
 								    min_cnt = -1;
 								    min_qid = 0;
 								    for (i = 0; i < netdev_n_txq(port->netdev); i++) {
 								        if (port->txq_used[i] < min_cnt || min_cnt == -1) {
 								            min_cnt = port->txq_used[i];
 								            min_qid = i;
 								        }
 								    }
 								    port->txq_used[min_qid]++;
 								    tx->qid = min_qid;
 								    ovs_mutex_unlock(&port->txq_used_mutex);
 								    dpif_netdev_xps_revalidate_pmd(pmd, now, false);
 								    VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
 								             pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
 								    return min_qid;
 								}
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								static struct tx_port *
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
 								                          odp_port_t port_no)
 								{
 								    return tx_port_lookup(&pmd->tnl_port_cache, port_no);
 								}
 								static struct tx_port *
 								pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
 								                           odp_port_t port_no)
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								{
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								    return tx_port_lookup(&pmd->send_port_cache, port_no);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								}
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								static int
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                const struct nlattr *attr,
 								                struct dp_packet_batch *batch)
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								{
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    struct tx_port *tun_port;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								    const struct ovs_action_push_tnl *data;
-												dpif-netdev: Fix memory leak in tunnel header push action.

in case of error from netdev_push_header() batch of packets was not
freed. Following patch fixes this issue.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:10 -07:00
+								    int err;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
 								    data = nl_attr_get(attr);
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								    tun_port = pmd_tnl_port_cache_lookup(pmd, u32_to_odp(data->tnl_port));
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								    if (!tun_port) {
-												dpif-netdev: Fix memory leak in tunnel header push action.

in case of error from netdev_push_header() batch of packets was not
freed. Following patch fixes this issue.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:10 -07:00
+								        err = -EINVAL;
 								        goto error;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								    }
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    err = netdev_push_header(tun_port->port->netdev, batch, data);
-												dpif-netdev: Fix memory leak in tunnel header push action.

in case of error from netdev_push_header() batch of packets was not
freed. Following patch fixes this issue.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:10 -07:00
+								    if (!err) {
 								        return 0;
 								    }
 								error:
 								    dp_packet_delete_batch(batch, true);
 								    return err;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								}
-												dpif-netdev: Refactor userspace action

Large segment support need to use this refactored function to
send individual segments.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:44 -07:00
+								static void
 								dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
 								                            struct dp_packet *packet, bool may_steal,
 								                            struct flow *flow, ovs_u128 *ufid,
 								                            struct ofpbuf *actions,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                            const struct nlattr *userdata, long long now)
-												dpif-netdev: Refactor userspace action

Large segment support need to use this refactored function to
send individual segments.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:44 -07:00
+								{
 								    struct dp_packet_batch b;
 								    int error;
 								    ofpbuf_clear(actions);
 								    error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
 								                             DPIF_UC_ACTION, userdata, actions,
 								                             NULL);
 								    if (!error || error == ENOSPC) {
 								        packet_batch_init_packet(&b, packet);
-												conntrack: Add 'dl_type' parameter to conntrack_execute().

Now that dpif_execute has a 'flow' member, it's pretty easy to access a
the flow (or the matching megaflow) in dp_execute_cb().

This means that's not necessary anymore for the connection tracker to
reextract 'dl_type' from the packet, it can be passed as a parameter.

This change means that we have to complicate sightly test-conntrack to
group the packets by dl_type before passing them to the connection
tracker.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-25 18:10:09 -07:00
+								        dp_netdev_execute_actions(pmd, &b, may_steal, flow,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                                  actions->data, actions->size, now);
-												dpif-netdev: Refactor userspace action

Large segment support need to use this refactored function to
send individual segments.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:44 -07:00
+								    } else if (may_steal) {
 								        dp_packet_delete(packet);
 								    }
 								}
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								static void
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
-												odp-execute: Consolidate callbacks.

Use one callback instead of many, helps in adding new functionality
later on.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								              const struct nlattr *a, bool may_steal)
-												dpif-netdev: Maintain the original key during execution.

Userspace action needs the original flow key.  This also
matches the kernel datapath behavior.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-14 14:35:58 -08:00
+								{
 								    struct dp_netdev_execute_aux *aux = aux_;
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								    uint32_t *depth = recirc_depth_get();
-												dpif-netdev: Batch packets when recirculating.

Now that we have per packet metadata, there's no need to split packet
batches when recirculating.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:49 +01:00
+								    struct dp_netdev_pmd_thread *pmd = aux->pmd;
 								    struct dp_netdev *dp = pmd->dp;
-												odp-execute: Consolidate callbacks.

Use one callback instead of many, helps in adding new functionality
later on.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								    int type = nl_attr_type(a);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								    long long now = aux->now;
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								    struct tx_port *p;
-												dpif-netdev: Maintain the original key during execution.

Userspace action needs the original flow key.  This also
matches the kernel datapath behavior.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-14 14:35:58 -08:00
-												odp-execute: Consolidate callbacks.

Use one callback instead of many, helps in adding new functionality
later on.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								    switch ((enum ovs_action_attr)type) {
 								    case OVS_ACTION_ATTR_OUTPUT:
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								        p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
-												dpif-netdev: delete lost packets in dp_execute_cb()

This commit fixes memory leaks in dp_execute_cb() in two cases:
    - when the output port cannot be found
    - when the recirculation depth is exceeded

Reported-by: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-25 11:39:34 -07:00
+								        if (OVS_LIKELY(p)) {
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
+								            int tx_qid;
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								            bool dynamic_txqs;
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								            dynamic_txqs = p->port->dynamic_txqs;
 								            if (dynamic_txqs) {
 								                tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
 								            } else {
 								                atomic_read_relaxed(&pmd->static_tx_qid, &tx_qid);
 								            }
-												dpif-netdev: Unique and sequential tx_qids.

Currently tx_qid is equal to pmd->core_id. This leads to unexpected
behavior if pmd-cpu-mask different from '/(0*)(1|3|7)?(f*)/',
e.g. if core_ids are not sequential, or doesn't start from 0, or both.

Example:
	starting 2 pmd threads with 1 port, 2 rxqs per port,
	pmd-cpu-mask = 00000014 and let dev->real_n_txq = 2

	It that case pmd_1->tx_qid = 2, pmd_2->tx_qid = 4 and
	txq_needs_locking = true (if device hasn't ovs_numa_get_n_cores()+1
	queues).

	In that case, after truncating in netdev_dpdk_send__():
		'qid = qid % dev->real_n_txq;'
	pmd_1: qid = 2 % 2 = 0
	pmd_2: qid = 4 % 2 = 0

	So, both threads will call dpdk_queue_pkts() with same qid = 0.
	This is unexpected behavior if there is 2 tx queues in device.
	Queue #1 will not be used and both threads will lock queue #0
	on each send.

Fix that by using sequential tx_qids.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-01-26 09:12:34 +03:00
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								            netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
 								                        dynamic_txqs);
-												dpif-netdev: Fix (packet) memory leaks in the slow path.

If a packet didn't match a rule in the fast path classifier its memory was
never freed. The issue was particularly clear with DPDK devices because it was
not possible to process more than ~250000 DPDK mbufs in the slow path.

This commit fixes the problem by:
* calling dpif_packet_delete() if the upcalls are disabled
* passing may_steal==true to dp_netdev_execute_actions() during normal upcall
  processing

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-19 16:20:01 -07:00
+								            return;
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								        }
-												odp-execute: Consolidate callbacks.

Use one callback instead of many, helps in adding new functionality
later on.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								        break;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								    case OVS_ACTION_ATTR_TUNNEL_PUSH:
 								        if (*depth < MAX_RECIRC_DEPTH) {
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								            struct dp_packet_batch tnl_pkt;
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								            struct dp_packet_batch *orig_packets_ = packets_;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								            int err;
 								            if (!may_steal) {
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                dp_packet_batch_clone(&tnl_pkt, packets_);
 								                packets_ = &tnl_pkt;
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								                dp_packet_batch_reset_cutlen(orig_packets_);
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								            }
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								            dp_packet_batch_apply_cutlen(packets_);
-												dpif-netdev: Add pmd thread local port cache for transmission.

A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assigment.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-05 18:41:09 -07:00
+								            err = push_tnl_action(pmd, a, packets_);
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								            if (!err) {
 								                (*depth)++;
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                dp_netdev_recirculate(pmd, packets_);
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								                (*depth)--;
 								            }
 								            return;
 								        }
 								        break;
 								    case OVS_ACTION_ATTR_TUNNEL_POP:
 								        if (*depth < MAX_RECIRC_DEPTH) {
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								            struct dp_packet_batch *orig_packets_ = packets_;
-												lib: Use nl_attr_get_odp_port().

This helper is a little tidier than the alternative. Use it treewide.

Signed-off-by: Joe Stringer <joe@ovn.org>
Acked-by: Simon Horman <simon.horman@netronome.com>

											
										
										
											2016-11-14 13:29:05 -08:00
+								            odp_port_t portno = nl_attr_get_odp_port(a);
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
-												dpif-netdev: Don't try to output on a device without txqs.

Tunnel devices have 0 txqs and don't support netdev_send().  While
netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
on output, and that might be confused by devices with no txqs.

It seems better to have different structures in the fast path for ports
that support netdev_{push,pop}_header (tunnel devices), and ports that
support netdev_send.  With this we can also remove a branch in
netdev_send().

This is also necessary for a future commit, which starts DPDK devices
without txqs.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-11-15 15:40:49 -08:00
+								            p = pmd_tnl_port_cache_lookup(pmd, portno);
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								            if (p) {
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                struct dp_packet_batch tnl_pkt;
-												dpif-netdev: Fix memory leak in tunnel header pop action.

The tunnel header pop action can leak batch of packet
in case of error. Following patch fixex the error code path.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:37 -07:00
+								                int i;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
 								                if (!may_steal) {
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								                    dp_packet_batch_clone(&tnl_pkt, packets_);
 								                    packets_ = &tnl_pkt;
 								                    dp_packet_batch_reset_cutlen(orig_packets_);
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								                }
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								                dp_packet_batch_apply_cutlen(packets_);
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                netdev_pop_header(p->port->netdev, packets_);
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                if (!packets_->count) {
-												netdev: Return number of packet from netdev_pop_header()

Current tunnel-pop API does not allow the netdev implementation
retain a packet but STT can keep a packet from batch of packets
during TCP reassembly processing. To return exact count of
valid packet STT need to pass this number of packet parameter
as a reference.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:06 -07:00
+								                    return;
 								                }
-												dpif-netdev: Fix memory leak in tunnel header pop action.

The tunnel header pop action can leak batch of packet
in case of error. Following patch fixex the error code path.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:37 -07:00
 								                for (i = 0; i < packets_->count; i++) {
 								                    packets_->packets[i]->md.in_port.odp_port = portno;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								                }
-												dpif-netdev: Fix memory leak in tunnel header pop action.

The tunnel header pop action can leak batch of packet
in case of error. Following patch fixex the error code path.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:37 -07:00
 								                (*depth)++;
 								                dp_netdev_recirculate(pmd, packets_);
 								                (*depth)--;
-												openvswitch: Userspace tunneling.

Following patch adds support for userspace tunneling. Tunneling
needs three more component first is routing table which is configured by
caching kernel routes and second is ARP cache which build automatically
by snooping arp. And third is tunnel protocol table which list all
listening protocols which is populated by vswitchd as tunnel ports
are added. GRE and VXLAN protocol support is added in this patch.

Tunneling works as follows:
On packet receive vswitchd check if this packet is targeted to tunnel
port. If it is then vswitchd inserts tunnel pop action which pops
header and sends packet to tunnel port.
On packet xmit rather than generating Set tunnel action it generate
tunnel push action which has tunnel header data. datapath can use
tunnel-push action data to generate header for each packet and
forward this packet to output port. Since tunnel-push action
contains most of packet header vswitchd needs to lookup routing
table and arp table to build this action.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-11-11 11:53:47 -08:00
+								                return;
 								            }
 								        }
 								        break;
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								    case OVS_ACTION_ATTR_USERSPACE:
 								        if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								            struct dp_packet_batch *orig_packets_ = packets_;
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								            struct dp_packet **packets = packets_->packets;
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								            const struct nlattr *userdata;
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								            struct dp_packet_batch usr_pkt;
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								            struct ofpbuf actions;
 								            struct flow flow;
-												dpif: Generate flow_hash for revalidators in dpif.

This patch shifts the responsibility for determining the hash for a flow
from the revalidation logic down to the dpif layer. This assists in
handling backward-compatibility for revalidation with the upcoming
unique flow identifier "UFID" patches.

A 128-bit UFID was selected to minimize the likelihood of hash conflicts.
Handler threads will not install a flow that has an identical UFID as
another flow, to prevent misattribution of stats and to ensure that the
correct flow key cache is used for revalidation.

For datapaths that do not support UFID, which is currently all
datapaths, the dpif will generate the UFID and pass it up during upcall
and flow_dump. This is generated based on the datapath flow key.

Later patches will add support for datapaths to store and interpret this
UFID, in which case the dpif has a responsibility to pass it through
transparently.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 15:24:39 +12:00
+								            ovs_u128 ufid;
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								            bool clone = false;
-												netdev: Return number of packet from netdev_pop_header()

Current tunnel-pop API does not allow the netdev implementation
retain a packet but STT can keep a packet from batch of packets
during TCP reassembly processing. To return exact count of
valid packet STT need to pass this number of packet parameter
as a reference.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:06 -07:00
+								            int i;
-												odp-execute: Refine signatures for odp_execute_actions() callbacks.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-09-20 12:47:33 -07:00
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								            userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
 								            ofpbuf_init(&actions, 0);
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								            if (packets_->trunc) {
 								                if (!may_steal) {
 								                    dp_packet_batch_clone(&usr_pkt, packets_);
 								                    packets_ = &usr_pkt;
 								                    packets = packets_->packets;
 								                    clone = true;
 								                    dp_packet_batch_reset_cutlen(orig_packets_);
 								                }
 								                dp_packet_batch_apply_cutlen(packets_);
 								            }
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								            for (i = 0; i < packets_->count; i++) {
-												dp-packet: Remove ofpbuf dependency.

Currently dp-packet make use of ofpbuf for managing packet
buffers. That complicates ofpbuf, by making dp-packet
independent of ofpbuf both libraries can be optimized for
their own use case.
This avoids mapping operation between ofpbuf and dp_packet
in datapath upcalls.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-02-22 03:21:09 -08:00
+								                flow_extract(packets[i], &flow);
-												dpif: Generate flow_hash for revalidators in dpif.

This patch shifts the responsibility for determining the hash for a flow
from the revalidation logic down to the dpif layer. This assists in
handling backward-compatibility for revalidation with the upcoming
unique flow identifier "UFID" patches.

A 128-bit UFID was selected to minimize the likelihood of hash conflicts.
Handler threads will not install a flow that has an identical UFID as
another flow, to prevent misattribution of stats and to ensure that the
correct flow key cache is used for revalidation.

For datapaths that do not support UFID, which is currently all
datapaths, the dpif will generate the UFID and pass it up during upcall
and flow_dump. This is generated based on the datapath flow key.

Later patches will add support for datapaths to store and interpret this
UFID, in which case the dpif has a responsibility to pass it through
transparently.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-09-24 15:24:39 +12:00
+								                dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
-												dpif-netdev: Refactor userspace action

Large segment support need to use this refactored function to
send individual segments.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:33:44 -07:00
+								                dp_execute_userspace_action(pmd, packets[i], may_steal, &flow,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                                            &ufid, &actions, userdata, now);
-												netdev-dpdk: Fix race condition with DPDK mempools in non pmd threads

DPDK mempools rely on rte_lcore_id() to implement a thread-local cache.
Our non pmd threads had rte_lcore_id() == 0. This allowed concurrent access to
the "thread-local" cache, causing crashes.

This commit resolves the issue with the following changes:

- Every non pmd thread has the same lcore_id (0, for management reasons), which
  is not shared with any pmd thread (lcore_id for pmd threads now start from 1)
- DPDK mbufs must be allocated/freed in pmd threads. When there is the need to
  use mempools in non pmd threads, like in dpdk_do_tx_copy(), a mutex must be
  held.
- The previous change does not allow us anymore to pass DPDK mbufs to handler
  threads: therefore this commit partially revert 143859ec63d45e. Now packets
  are copied for upcall processing. We can remove the extra memcpy by
  processing upcalls in the pmd thread itself.

With the introduction of the extra locking, the packet throughput will be lower
in the following cases:

- When using internal (tap) devices with DPDK devices on the same datapath.
  Anyway, to support internal devices efficiently, we needed DPDK KNI devices,
  which will be proper pmd devices and will not need this locking.
- When packets are processed in the slow path by non pmd threads. This overhead
  can be avoided by handling the upcalls directly in pmd threads (a change that
  has already been proposed by Ryan Wilson)

Also, the following two fixes have been introduced:
- In dpdk_free_buf() use rte_pktmbuf_free_seg() instead of rte_mempool_put().
  This allows OVS to run properly with CONFIG_RTE_LIBRTE_MBUF_DEBUG DPDK option
- Do not bulk free mbufs in a transmission queue. They may belong to different
  mempools

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-07-17 14:29:36 -07:00
+								            }
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
 								            if (clone) {
 								                dp_packet_delete_batch(packets_, true);
 								            }
-												dpif-netdev: Streamline miss handling.

This patch avoids the relatively inefficient miss handling processes
dictated by the dpif process, by calling into ofproto-dpif directly
through a callback.

Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-26 15:39:58 -07:00
+								            ofpbuf_uninit(&actions);
 								            fat_rwlock_unlock(&dp->upcall_rwlock);
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
-												dpif-netdev: Fix (packet) memory leaks in the slow path.

If a packet didn't match a rule in the fast path classifier its memory was
never freed. The issue was particularly clear with DPDK devices because it was
not possible to process more than ~250000 DPDK mbufs in the slow path.

This commit fixes the problem by:
* calling dpif_packet_delete() if the upcalls are disabled
* passing may_steal==true to dp_netdev_execute_actions() during normal upcall
  processing

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-19 16:20:01 -07:00
+								            return;
 								        }
-												odp-execute: Consolidate callbacks.

Use one callback instead of many, helps in adding new functionality
later on.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								        break;
-												dpif-netdev: user space datapath recirculation

Add basic recirculation infrastructure and user space
data path support for it. The following bond mega flow patch will
make use of this infrastructure.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-04 15:36:03 -08:00
-												ofproto/bond: Implement bond megaflow using recirculation

Infrastructure to enable megaflow support for bond ports using
recirculation. This patch adds the following features:
* Generate RECIRC action when bond can benefit from recirculation.
* Populate post recirculation rules in a hidden table. Currently table 254.
* Uses post recirculation rules for bond rebalancing
* A recirculation implementation in dpif-netdev.

The goal of this patch is to be able to megaflow bond outputs and
thus greatly improve performance. However, this patch does not
actually improve the megaflow generation. It is left for a later commit.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-05 15:27:31 -08:00
+								    case OVS_ACTION_ATTR_RECIRC:
 								        if (*depth < MAX_RECIRC_DEPTH) {
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								            struct dp_packet_batch recirc_pkts;
-												netdev: Return number of packet from netdev_pop_header()

Current tunnel-pop API does not allow the netdev implementation
retain a packet but STT can keep a packet from batch of packets
during TCP reassembly processing. To return exact count of
valid packet STT need to pass this number of packet parameter
as a reference.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:06 -07:00
+								            int i;
-												dpif-netdev: user space datapath recirculation

Add basic recirculation infrastructure and user space
data path support for it. The following bond mega flow patch will
make use of this infrastructure.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-04 15:36:03 -08:00
-												dpif-netdev: Batch packets when recirculating.

Now that we have per packet metadata, there's no need to split packet
batches when recirculating.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:49 +01:00
+								            if (!may_steal) {
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								               dp_packet_batch_clone(&recirc_pkts, packets_);
 								               packets_ = &recirc_pkts;
-												dpif-netdev: Batch packets when recirculating.

Now that we have per packet metadata, there's no need to split packet
batches when recirculating.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:49 +01:00
+								            }
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								            for (i = 0; i < packets_->count; i++) {
 								                packets_->packets[i]->md.recirc_id = nl_attr_get_u32(a);
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								            }
-												dpif-netdev: Batch packets when recirculating.

Now that we have per packet metadata, there's no need to split packet
batches when recirculating.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-15 19:11:49 +01:00
 								            (*depth)++;
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								            dp_netdev_recirculate(pmd, packets_);
-												ofproto/bond: Implement bond megaflow using recirculation

Infrastructure to enable megaflow support for bond ports using
recirculation. This patch adds the following features:
* Generate RECIRC action when bond can benefit from recirculation.
* Populate post recirculation rules in a hidden table. Currently table 254.
* Uses post recirculation rules for bond rebalancing
* A recirculation implementation in dpif-netdev.

The goal of this patch is to be able to megaflow bond outputs and
thus greatly improve performance. However, this patch does not
actually improve the megaflow generation. It is left for a later commit.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-05 15:27:31 -08:00
+								            (*depth)--;
-												dpif-netdev: Fix (packet) memory leaks in the slow path.

If a packet didn't match a rule in the fast path classifier its memory was
never freed. The issue was particularly clear with DPDK devices because it was
not possible to process more than ~250000 DPDK mbufs in the slow path.

This commit fixes the problem by:
* calling dpif_packet_delete() if the upcalls are disabled
* passing may_steal==true to dp_netdev_execute_actions() during normal upcall
  processing

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-19 16:20:01 -07:00
+								            return;
-												ofproto/bond: Implement bond megaflow using recirculation

Infrastructure to enable megaflow support for bond ports using
recirculation. This patch adds the following features:
* Generate RECIRC action when bond can benefit from recirculation.
* Populate post recirculation rules in a hidden table. Currently table 254.
* Uses post recirculation rules for bond rebalancing
* A recirculation implementation in dpif-netdev.

The goal of this patch is to be able to megaflow bond outputs and
thus greatly improve performance. However, this patch does not
actually improve the megaflow generation. It is left for a later commit.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-05 15:27:31 -08:00
+								        }
-												dpif-netdev: Fix (packet) memory leaks in the slow path.

If a packet didn't match a rule in the fast path classifier its memory was
never freed. The issue was particularly clear with DPDK devices because it was
not possible to process more than ~250000 DPDK mbufs in the slow path.

This commit fixes the problem by:
* calling dpif_packet_delete() if the upcalls are disabled
* passing may_steal==true to dp_netdev_execute_actions() during normal upcall
  processing

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-19 16:20:01 -07:00
 								        VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
-												dpif-netdev: user space datapath recirculation

Add basic recirculation infrastructure and user space
data path support for it. The following bond mega flow patch will
make use of this infrastructure.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-04 15:36:03 -08:00
+								        break;
-												dpif-netdev: Execute conntrack action.

This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will not
discard anymore flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>

											
										
										
											2015-11-15 22:07:25 -08:00
+								    case OVS_ACTION_ATTR_CT: {
 								        const struct nlattr *b;
 								        bool commit = false;
 								        unsigned int left;
 								        uint16_t zone = 0;
 								        const char *helper = NULL;
 								        const uint32_t *setmark = NULL;
 								        const struct ovs_key_ct_labels *setlabel = NULL;
 								        NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
 								                                 nl_attr_get_size(a)) {
 								            enum ovs_ct_attr sub_type = nl_attr_type(b);
 								            switch(sub_type) {
 								            case OVS_CT_ATTR_COMMIT:
 								                commit = true;
 								                break;
 								            case OVS_CT_ATTR_ZONE:
 								                zone = nl_attr_get_u16(b);
 								                break;
 								            case OVS_CT_ATTR_HELPER:
 								                helper = nl_attr_get_string(b);
 								                break;
 								            case OVS_CT_ATTR_MARK:
 								                setmark = nl_attr_get(b);
 								                break;
 								            case OVS_CT_ATTR_LABELS:
 								                setlabel = nl_attr_get(b);
 								                break;
 								            case OVS_CT_ATTR_NAT:
 								            case OVS_CT_ATTR_UNSPEC:
 								            case __OVS_CT_ATTR_MAX:
 								                OVS_NOT_REACHED();
 								            }
 								        }
-												conntrack: Add 'dl_type' parameter to conntrack_execute().

Now that dpif_execute has a 'flow' member, it's pretty easy to access a
the flow (or the matching megaflow) in dp_execute_cb().

This means that's not necessary anymore for the connection tracker to
reextract 'dl_type' from the packet, it can be passed as a parameter.

This change means that we have to complicate sightly test-conntrack to
group the packets by dl_type before passing them to the connection
tracker.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-25 18:10:09 -07:00
+								        conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, commit,
 								                          zone, setmark, setlabel, helper);
-												Add support for connection tracking.

This patch adds a new action and fields to OVS that allow connection
tracking to be performed. This support works in conjunction with the
Linux kernel support merged into the Linux-4.3 development cycle.

Packets have two possible states with respect to connection tracking:
Untracked packets have not previously passed through the connection
tracker, while tracked packets have previously been through the
connection tracker. For OpenFlow pipeline processing, untracked packets
can become tracked, and they will remain tracked until the end of the
pipeline. Tracked packets cannot become untracked.

Connections can be unknown, uncommitted, or committed. Packets which are
untracked have unknown connection state. To know the connection state,
the packet must become tracked. Uncommitted connections have no
connection state stored about them, so it is only possible for the
connection tracker to identify whether they are a new connection or
whether they are invalid. Committed connections have connection state
stored beyond the lifetime of the packet, which allows later packets in
the same connection to be identified as part of the same established
connection, or related to an existing connection - for instance ICMP
error responses.

The new 'ct' action transitions the packet from "untracked" to
"tracked" by sending this flow through the connection tracker.
The following parameters are supported initally:

- "commit": When commit is executed, the connection moves from
  uncommitted state to committed state. This signals that information
  about the connection should be stored beyond the lifetime of the
  packet within the pipeline. This allows future packets in the same
  connection to be recognized as part of the same "established" (est)
  connection, as well as identifying packets in the reply (rpl)
  direction, or packets related to an existing connection (rel).
- "zone=[u16|NXM]": Perform connection tracking in the zone specified.
  Each zone is an independent connection tracking context. When the
  "commit" parameter is used, the connection will only be committed in
  the specified zone, and not in other zones. This is 0 by default.
- "table=NUMBER": Fork pipeline processing in two. The original instance
  of the packet will continue processing the current actions list as an
  untracked packet. An additional instance of the packet will be sent to
  the connection tracker, which will be re-injected into the OpenFlow
  pipeline to resume processing in the specified table, with the
  ct_state and other ct match fields set. If the table is not specified,
  then the packet is submitted to the connection tracker, but the
  pipeline does not fork and the ct match fields are not populated. It
  is strongly recommended to specify a table later than the current
  table to prevent loops.

When the "table" option is used, the packet that continues processing in
the specified table will have the ct_state populated. The ct_state may
have any of the following flags set:

- Tracked (trk): Connection tracking has occurred.
- Reply (rpl): The flow is in the reply direction.
- Invalid (inv): The connection tracker couldn't identify the connection.
- New (new): This is the beginning of a new connection.
- Established (est): This is part of an already existing connection.
- Related (rel): This connection is related to an existing connection.

For more information, consult the ovs-ofctl(8) man pages.

Below is a simple example flow table to allow outbound TCP traffic from
port 1 and drop traffic from port 2 that was not initiated by port 1:

    table=0,priority=1,action=drop
    table=0,arp,action=normal
    table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2
    table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1)
    table=1,in_port=2,ct_state=+trk+est,tcp,action=1
    table=1,in_port=2,ct_state=+trk+new,tcp,action=drop

Based on original design by Justin Pettit, contributions from Thomas
Graf and Daniele Di Proietto.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-08-11 10:56:09 -07:00
+								        break;
-												dpif-netdev: Execute conntrack action.

This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will not
discard anymore flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>

											
										
										
											2015-11-15 22:07:25 -08:00
+								    }
-												Add support for connection tracking.

This patch adds a new action and fields to OVS that allow connection
tracking to be performed. This support works in conjunction with the
Linux kernel support merged into the Linux-4.3 development cycle.

Packets have two possible states with respect to connection tracking:
Untracked packets have not previously passed through the connection
tracker, while tracked packets have previously been through the
connection tracker. For OpenFlow pipeline processing, untracked packets
can become tracked, and they will remain tracked until the end of the
pipeline. Tracked packets cannot become untracked.

Connections can be unknown, uncommitted, or committed. Packets which are
untracked have unknown connection state. To know the connection state,
the packet must become tracked. Uncommitted connections have no
connection state stored about them, so it is only possible for the
connection tracker to identify whether they are a new connection or
whether they are invalid. Committed connections have connection state
stored beyond the lifetime of the packet, which allows later packets in
the same connection to be identified as part of the same established
connection, or related to an existing connection - for instance ICMP
error responses.

The new 'ct' action transitions the packet from "untracked" to
"tracked" by sending this flow through the connection tracker.
The following parameters are supported initally:

- "commit": When commit is executed, the connection moves from
  uncommitted state to committed state. This signals that information
  about the connection should be stored beyond the lifetime of the
  packet within the pipeline. This allows future packets in the same
  connection to be recognized as part of the same "established" (est)
  connection, as well as identifying packets in the reply (rpl)
  direction, or packets related to an existing connection (rel).
- "zone=[u16|NXM]": Perform connection tracking in the zone specified.
  Each zone is an independent connection tracking context. When the
  "commit" parameter is used, the connection will only be committed in
  the specified zone, and not in other zones. This is 0 by default.
- "table=NUMBER": Fork pipeline processing in two. The original instance
  of the packet will continue processing the current actions list as an
  untracked packet. An additional instance of the packet will be sent to
  the connection tracker, which will be re-injected into the OpenFlow
  pipeline to resume processing in the specified table, with the
  ct_state and other ct match fields set. If the table is not specified,
  then the packet is submitted to the connection tracker, but the
  pipeline does not fork and the ct match fields are not populated. It
  is strongly recommended to specify a table later than the current
  table to prevent loops.

When the "table" option is used, the packet that continues processing in
the specified table will have the ct_state populated. The ct_state may
have any of the following flags set:

- Tracked (trk): Connection tracking has occurred.
- Reply (rpl): The flow is in the reply direction.
- Invalid (inv): The connection tracker couldn't identify the connection.
- New (new): This is the beginning of a new connection.
- Established (est): This is part of an already existing connection.
- Related (rel): This connection is related to an existing connection.

For more information, consult the ovs-ofctl(8) man pages.

Below is a simple example flow table to allow outbound TCP traffic from
port 1 and drop traffic from port 2 that was not initiated by port 1:

    table=0,priority=1,action=drop
    table=0,arp,action=normal
    table=0,in_port=1,tcp,ct_state=-trk,action=ct(commit,zone=9),2
    table=0,in_port=2,tcp,ct_state=-trk,action=ct(zone=9,table=1)
    table=1,in_port=2,ct_state=+trk+est,tcp,action=1
    table=1,in_port=2,ct_state=+trk+new,tcp,action=drop

Based on original design by Justin Pettit, contributions from Thomas
Graf and Daniele Di Proietto.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-08-11 10:56:09 -07:00
-												odp-execute: Consolidate callbacks.

Use one callback instead of many, helps in adding new functionality
later on.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								    case OVS_ACTION_ATTR_PUSH_VLAN:
 								    case OVS_ACTION_ATTR_POP_VLAN:
 								    case OVS_ACTION_ATTR_PUSH_MPLS:
 								    case OVS_ACTION_ATTR_POP_MPLS:
 								    case OVS_ACTION_ATTR_SET:
-												lib/odp: Masked set action execution and printing.

Add a new action type OVS_ACTION_ATTR_SET_MASKED, and support for
parsing, printing, and committing them.

Masked set actions add a mask, immediately following the netlink
attribute data, within the netlink attribute itself.  Thus the key
attribute size for a masked set action is exactly double of the
non-masked set action.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2014-09-05 15:44:19 -07:00
+								    case OVS_ACTION_ATTR_SET_MASKED:
-												odp-execute: Consolidate callbacks.

Use one callback instead of many, helps in adding new functionality
later on.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								    case OVS_ACTION_ATTR_SAMPLE:
-												dpif-netdev: Remove redundant hash action handling.

odp_execute_actions() already handles hash execution part.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-11-13 15:03:39 -08:00
+								    case OVS_ACTION_ATTR_HASH:
-												odp-execute: Consolidate callbacks.

Use one callback instead of many, helps in adding new functionality
later on.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								    case OVS_ACTION_ATTR_UNSPEC:
-												ofp-actions: Add truncate action.

The patch adds a new action to support packet truncation.  The new action
is formatted as 'output(port=n,max_len=m)', as output to port n, with
packet size being MIN(original_size, m).

One use case is to enable port mirroring to send smaller packets to the
destination port so that only useful packet information is mirrored/copied,
saving some performance overhead of copying entire packet payload.  Example
use case is below as well as shown in the testcases:

    - Output to port 1 with max_len 100 bytes.
    - The output packet size on port 1 will be MIN(original_packet_size, 100).
    # ovs-ofctl add-flow br0 'actions=output(port=1,max_len=100)'

    - The scope of max_len is limited to output action itself.  The following
      packet size of output:1 and output:2 will be intact.
    # ovs-ofctl add-flow br0 \
            'actions=output(port=1,max_len=100),output:1,output:2'
    - The Datapath actions shows:
    # Datapath actions: trunc(100),1,1,2

Tested-at: https://travis-ci.org/williamtu/ovs-travis/builds/140037134
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>

											
										
										
											2016-06-24 07:42:30 -07:00
+								    case OVS_ACTION_ATTR_TRUNC:
-												odp-execute: Consolidate callbacks.

Use one callback instead of many, helps in adding new functionality
later on.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-30 15:58:58 -08:00
+								    case __OVS_ACTION_ATTR_MAX:
 								        OVS_NOT_REACHED();
-												dpif: Allow execute to modify the packet.

Allowing the packet to be modified by execution allows less data
copying for userspace action execution.  Some users of the
dpif_execute already expect that the packet may be modified.  This
patch makes this behavior uniform and makes the userspace datapath and
the execution helpers modify the packet as it is being executed.
Userspace action now steals the packet if given permission, as the
packet is normally not needed after it.  The only exception is the
sample action, and this is accounted for my keeping track of any
actions that could be following the userspace action.

The packet in dpif_upcall is changed from a pointer to a struct,
allowing the packet to be honest about it's headroom.  After this
change the packet can safely be pushed on over the precarious 4 byte
limit earlier allowed by the netlink data preceding the packet.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2013-12-16 08:14:52 -08:00
+								    }
-												dpif-netdev: Fix (packet) memory leaks in the slow path.

If a packet didn't match a rule in the fast path classifier its memory was
never freed. The issue was particularly clear with DPDK devices because it was
not possible to process more than ~250000 DPDK mbufs in the slow path.

This commit fixes the problem by:
* calling dpif_packet_delete() if the upcalls are disabled
* passing may_steal==true to dp_netdev_execute_actions() during normal upcall
  processing

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-19 16:20:01 -07:00
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    dp_packet_delete_batch(packets_, may_steal);
-												datapath: Move Netlink PID for userspace actions from flows to actions.

Commit b063d9f06 "datapath: Use unicast Netlink sockets for upcalls" that
switched from multicast to unicast Netlink for sending upcalls added a
Netlink PID to each kernel flow, used by OVS_ACTION_ATTR_USERSPACE actions
within the flow as target.

This commit drops this per-flow PID in favor of a per-action PID, because
that is more flexible.  It does not yet make use of this additional
flexibility, so behavior should not change.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Bug #7559.

											
										
										
											2011-10-12 16:24:54 -07:00
+								}
-												datapath: Refactor actions in terms of match fields.

Almost all current actions can be expressed in the form of
push/pop/set <field>, where field is one of the match fields. We can
create three base actions and take a field. This has both a nice
symmetry and avoids inconsistencies where we can match on the vlan
TPID but not set it.
Following patch converts all actions to this new format.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

Bug #7115

											
										
										
											2011-10-21 14:38:54 -07:00
+								static void
-												dpif-netdev: Create multiple pmd threads by default.

With this commit, ovs by default will create one pmd thread
for each numa node and pin the pmd thread to available cpu
core on the numa node.

NON_PMD_CORE_ID (currently 0) is used to reserve a particular
cpu core for the I/O of all non-pmd threads.  No pmd thread
can be pinned to this reserved core.

As side-effects of this commit:

-  pmd thread will not be created, if there is no dpdk interface
   from the corresponding numa node added to ovs.

- the exact-match cache for non-pmd threads is removed from
  'struct dp_netdev'.  Instead, all non-pmd threads will use
  the exact-match cache defined in the 'struct dp_netdev_pmd_thread'
  for NON_PMD_CORE_ID.

- the rx packet processing functions are refactored to use
  'struct dp_netdev_pmd_thread' as input.

- the 'netdev_send()' function will be called with the proper
  queue id.

- both pmd and non-pmd threads can call the dpif_netdev_execute().
  so, use a per-thread key to help recognize the calling thread.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>


											
										
										
											2014-09-05 14:14:20 -07:00
+								dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								                          struct dp_packet_batch *packets,
-												conntrack: Add 'dl_type' parameter to conntrack_execute().

Now that dpif_execute has a 'flow' member, it's pretty easy to access a
the flow (or the matching megaflow) in dp_execute_cb().

This means that's not necessary anymore for the connection tracker to
reextract 'dl_type' from the packet, it can be passed as a parameter.

This change means that we have to complicate sightly test-conntrack to
group the packets by dl_type before passing them to the connection
tracker.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-25 18:10:09 -07:00
+								                          bool may_steal, const struct flow *flow,
-												dpif-netdev: XPS (Transmit Packet Steering) implementation.

If CPU number in pmd-cpu-mask is not divisible by the number of queues and
in a few more complex situations there may be unfair distribution of TX
queue-ids between PMD threads.

For example, if we have 2 ports with 4 queues and 6 CPUs in pmd-cpu-mask
such distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above dpdk0 port polled by threads on cores:
	12, 13, 16 and 17.

By design of dpif-netdev, there is only one TX queue-id assigned to each
pmd thread. This queue-id's are sequential similar to core-id's. And
thread will send packets to queue with exact this queue-id regardless
of port.

In previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for dpdk0 port after truncating in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result only 2 of 4 queues used.

To fix this issue some kind of XPS implemented in following way:

	* TX queue-ids are allocated dynamically.
	* When PMD thread first time tries to send packets to new port
	  it allocates less used TX queue for this port.
	* PMD threads periodically performes revalidation of
	  allocated TX queue-ids. If queue wasn't used in last
	  XPS_TIMEOUT_MS milliseconds it will be freed while revalidation.
        * XPS is not working if we have enough TX queues.

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:41 +03:00
+								                          const struct nlattr *actions, size_t actions_len,
 								                          long long now)
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								{
-												conntrack: Add 'dl_type' parameter to conntrack_execute().

Now that dpif_execute has a 'flow' member, it's pretty easy to access a
the flow (or the matching megaflow) in dp_execute_cb().

This means that's not necessary anymore for the connection tracker to
reextract 'dl_type' from the packet, it can be passed as a parameter.

This change means that we have to complicate sightly test-conntrack to
group the packets by dl_type before passing them to the connection
tracker.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-05-25 18:10:09 -07:00
+								    struct dp_netdev_execute_aux aux = { pmd, now, flow };
-												dpif-netdev: Maintain the original key during execution.

Userspace action needs the original flow key.  This also
matches the kernel datapath behavior.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-11-14 14:35:58 -08:00
-												dpif-netdev: create batch object

DPDK datapath operate on batch of packets. To pass the batch of
packets around we use packets array and count.  Next patch needs
to associate meta-data with each batch of packets. So Introducing
a batch structure to make handling the metadata easier.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jesse Gross <jesse@kernel.org>

											
										
										
											2016-05-17 17:32:33 -07:00
+								    odp_execute_actions(&aux, packets, may_steal, actions,
-												dpif-netdev: batch packet processing

This change in dpif-netdev allows faster packet processing for devices which
implement batching (netdev-dpdk currently).

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-06-23 11:43:59 -07:00
+								                        actions_len, dp_execute_cb);
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								}
-												dpif-netdev: Implement conntrack dump functions.

New functions are implemented in the conntrack module to support this.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>

											
										
										
											2015-11-15 22:07:25 -08:00
+								struct dp_netdev_ct_dump {
 								    struct ct_dpif_dump_state up;
 								    struct conntrack_dump dump;
 								    struct conntrack *ct;
 								    struct dp_netdev *dp;
 								};
 								static int
 								dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
 								                          const uint16_t *pzone)
 								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
 								    struct dp_netdev_ct_dump *dump;
 								    dump = xzalloc(sizeof *dump);
 								    dump->dp = dp;
 								    dump->ct = &dp->conntrack;
 								    conntrack_dump_start(&dp->conntrack, &dump->dump, pzone);
 								    *dump_ = &dump->up;
 								    return 0;
 								}
 								static int
 								dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
 								                         struct ct_dpif_dump_state *dump_,
 								                         struct ct_dpif_entry *entry)
 								{
 								    struct dp_netdev_ct_dump *dump;
 								    INIT_CONTAINER(dump, dump_, up);
 								    return conntrack_dump_next(&dump->dump, entry);
 								}
 								static int
 								dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
 								                         struct ct_dpif_dump_state *dump_)
 								{
 								    struct dp_netdev_ct_dump *dump;
 								    int err;
 								    INIT_CONTAINER(dump, dump_, up);
 								    err = conntrack_dump_done(&dump->dump);
 								    free(dump);
 								    return err;
 								}
-												dpif-netdev: Implement conntrack flush interface.

New functions are implemented in the conntrack module to support this.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>

											
										
										
											2015-11-15 22:07:25 -08:00
+								static int
 								dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
 								{
 								    struct dp_netdev *dp = get_dp_netdev(dpif);
 								    return conntrack_flush(&dp->conntrack, zone);
 								}
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								const struct dpif_class dpif_netdev_class = {
 								    "netdev",
-												dpif-netdev: Add dpif-netdev/pmd-stats-* appctl commands.

These commands can be used to get packets and cycles counters on a pmd
thread basis.  They're useful to get a clearer picture about the
performance of the userspace datapath.

They export these pieces of information:

- A (per-thread) view of the caches hit rate. Hits in the exact match
  cache are reported separately from hits in the masked classifier
- A rough cycles count. This will allow to estimate the load of OVS and
  the polling overhead.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2015-04-10 19:09:50 +01:00
+								    dpif_netdev_init,
-												dpif-netdev: allow for proper destruction of netdev datapaths

Until now, bridges with datapath_type=netdev did not destroy the datapath
when deleted. In particular, the tap device implementing the internal
interface was not close()d, and therefore the tap persists until
ovs-vswitchd exit()s.

This behaviour was caused by the missing callback for 'enumerate' in the
dpif-netdev class. Without this callback 'bridge_reconfigure' failed to
realize that there are datapaths with no bridge, and thus cannot destroy
them. Providing an 'enumerate' callback fixes this.

Signed-off-by: Giuseppe Lettieri <g.lettieri@iet.unipi.it>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-09 12:17:15 +02:00
+								    dpif_netdev_enumerate,
-												Add functions to determine how port should be opened based on type.

Depending on the port and type of datapath, a port may need to be opened
as a different type of device than it's configured.  For example, an
"internal" port on a "dummy" datapath should opened as a "dummy" port.
This commit adds the ability for a dpif to provide this information to a
caller.  It will be used in a future commit.

Signed-off-by: Justin Pettit <jpettit@nicira.com>

											
										
										
											2012-11-14 15:50:20 -08:00
+								    dpif_netdev_port_open_type,
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    dpif_netdev_open,
 								    dpif_netdev_close,
-												Fix some regressions from the merge from master.

											
										
										
											2010-02-08 13:22:41 -05:00
+								    dpif_netdev_destroy,
-												dpif-netdev: Add poll-mode-device thread.

This patch adds PMD type netdev for netdevice with poll-mode
drivers.  Since there is no way to get signal on a packet recv
from these devices we need to poll them in busy loop.  So minimize
system call overhead this patch uses dpif-thread exclusively
for PMD devices and rest of devices which needs system calls to
do IO are moved to dpif-netdev-run().
PMD device like DPDK work in userspace so there is no system call
overhead for them.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Thomas Graf <tgraf@redhat.com>

											
										
										
											2014-03-20 10:57:41 -07:00
+								    dpif_netdev_run,
 								    dpif_netdev_wait,
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    dpif_netdev_get_stats,
 								    dpif_netdev_port_add,
 								    dpif_netdev_port_del,
-												dpif-netdev: Introduce pmd-rxq-affinity.

New 'other_config:pmd-rxq-affinity' field for Interface table to
perform manual pinning of RX queues to desired cores.

This functionality is required to achieve maximum performance because
all kinds of ports have different cost of rx/tx operations and
only user can know about expected workload on different ports.

Example:
	# ./bin/ovs-vsctl set interface dpdk0 options:n_rxq=4 \
	                  other_config:pmd-rxq-affinity="0:3,1:7,3:8"
	Queue #0 pinned to core 3;
	Queue #1 pinned to core 7;
	Queue #2 not pinned.
	Queue #3 pinned to core 8;

It's decided to automatically isolate cores that have rxq explicitly
assigned to them because it's useful to keep constant polling rate on
some performance critical ports while adding/deleting other ports
without explicit pinning of all ports.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-07-27 17:44:44 +03:00
+								    dpif_netdev_port_set_config,
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    dpif_netdev_port_query_by_number,
 								    dpif_netdev_port_query_by_name,
-												datapath: Move Netlink PID for userspace actions from flows to actions.

Commit b063d9f06 "datapath: Use unicast Netlink sockets for upcalls" that
switched from multicast to unicast Netlink for sending upcalls added a
Netlink PID to each kernel flow, used by OVS_ACTION_ATTR_USERSPACE actions
within the flow as target.

This commit drops this per-flow PID in favor of a per-action PID, because
that is more flexible.  It does not yet make use of this additional
flexibility, so behavior should not change.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Bug #7559.

											
										
										
											2011-10-12 16:24:54 -07:00
+								    NULL,                       /* port_get_pid */
-												datapath: Change listing ports to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to add new
features to the kernel vport layer without changing userspace software.  In
turn, that means that the odp_port structure must become variable-length.
This does not, however, fit in well with the ODP_PORT_LIST ioctl in its
current form, because that would require userspace to know how much space
to allocate for each port in advance, or to allocate as much space as
could possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_PORT_LIST
by a new ioctl ODP_VPORT_DUMP that retrieves information about a single
vport from the datapath on each call.  It is much cleaner to allocate the
maximum amount of space for a single vport than to do so for possibly a
large number of vports.

It would be faster to retrieve a number of vports in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

The Netlink version won't need to take the starting port number from
userspace, since Netlink sockets can keep track of that state as part
of their "dump" feature.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2011-01-10 13:12:12 -08:00
+								    dpif_netdev_port_dump_start,
 								    dpif_netdev_port_dump_next,
 								    dpif_netdev_port_dump_done,
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								    dpif_netdev_port_poll,
 								    dpif_netdev_port_poll_wait,
 								    dpif_netdev_flow_flush,
-												dpif: Refactor flow dumping interface to make better sense for batching.

Commit a6ce4b9d251 (ofproto-dpif-upcall: Avoid use-after-free in
revalidate() corner case.) showed that it is somewhat tricky to correctly
use the existing dpif flow dumping interface to obtain batches of flows.
One has to be careful about calling dpif_flow_dump_next_may_destroy_keys()
before going on to the next flow.

A better interface is possible, one that is naturally oriented toward
retrieving batches when that is a useful optimization.  This commit
replaces the dpif interface by such a design, and updates both the
implementations and the callers to adopt it.

This is a fairly large change, but I think that the code in
ofproto-dpif-upcall is easier to understand after the change.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-05-20 11:37:02 -07:00
+								    dpif_netdev_flow_dump_create,
 								    dpif_netdev_flow_dump_destroy,
 								    dpif_netdev_flow_dump_thread_create,
 								    dpif_netdev_flow_dump_thread_destroy,
-												datapath: Change listing flows to use an iterator concept.

One of the goals for Open vSwitch is to decouple kernel and userspace
software, so that either one can be upgraded or rolled back independent of
the other.  To do this in full generality, it must be possible to change
the kernel's idea of the flow key separately from the userspace version.
In turn, that means that flow keys must become variable-length.  This does
not, however, fit in well with the ODP_FLOW_LIST ioctl in its current form,
because that would require userspace to know how much space to allocate
for each flow's key in advance, or to allocate as much space as could
possibly be needed.  Neither choice is very attractive.

This commit prepares for a different solution, by replacing ODP_FLOW_LIST
by a new ioctl ODP_FLOW_DUMP that retrieves a single flow from the datapath
on each call.  It is much cleaner to allocate the maximum amount of space
for a single flow key than to do so for possibly a very large number of
flow keys.

As a side effect, this patch also fixes a race condition that sometimes
made "ovs-dpctl dump-flows" print an error: previously, flows were listed
and then their actions were retrieved, which left a window in which
ovs-vswitchd could delete the flow.  Now dumping a flow and its actions is
a single step, closing that window.

Dumping all of the flows in a datapath is no longer an atomic step, so now
it is possible to miss some flows or see a single flow twice during
iteration, if the flow table is modified by another process.  It doesn't
look like this should be a problem for ovs-vswitchd.

It would be faster to retrieve a number of flows in batch instead of just
one at a time, but that will naturally happen later when the kernel
datapath interface is changed to use Netlink, so this patch does not bother
with it.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-12-28 10:39:52 -08:00
+								    dpif_netdev_flow_dump_next,
-												dpif-provider: Get rid of redundant operations.

The dpif provider 'operate' call duplicates all of the features available
from the 'flow_put', 'flow_del', and 'execute' calls, yielding redundant
code in providers that support both mechanisms.  This change drops the
latter calls in favor of making every dpif provider support 'operate'.
The result is code that is overall less duplicative.

It might make sense to do the same with flow_get but so far 'operate'
doesn't support flow_get.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-07-15 16:09:40 -07:00
+								    dpif_netdev_operate,
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								    NULL,                       /* recv_set */
 								    NULL,                       /* handlers_set */
-												dpif-netdev: Allow multi-rx-queue, multi-pmd-thread configuration.

This commits adds the multithreading functionality to OVS dpdk
module.  Users are able to create multiple pmd threads and set
their cpu affinity via specifying the cpu mask string similar
to the EAL '-c COREMASK' option.

Also, the number of rx queues for each dpdk interface is made
configurable to help distribution of rx packets among multiple
pmd threads.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>

											
										
										
											2014-09-08 15:22:26 -07:00
+								    dpif_netdev_pmd_set,
-												dpif-netdev: Allow enqueue actions.

The dpif-netdev implementation disallowed enqueue actions because
it did not support conversion from OVS 'queue_id' to dpif
'priority'.  For testing purposes, this patch allows queues which
translate into NOOPs.

											
										
										
											2011-11-21 13:36:17 -08:00
+								    dpif_netdev_queue_to_priority,
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								    NULL,                       /* recv */
 								    NULL,                       /* recv_wait */
 								    NULL,                       /* recv_purge */
-												dpif-netdev: Purge all ukeys when reconfigure pmd.

When dpdk configuration changes, all pmd threads are recreated
and rx queues of each port are reloaded.  After this process,
rx queue could be mapped to a different pmd thread other than
the one before reconfiguration.  However, this is totally
transparent to ofproto layer modules.  So, if the ofproto-dpif-upcall
module still holds ukeys generated before pmd thread recreation,
this old ukey will collide with the ukey for the new upcalls
from same traffic flow, causing flow installation failure.

To fix the bug, this commit adds a new call-back function
in dpif layer for notifying upper layer the purging of datapath
(e.g. pmd thread deletion in dpif-netdev).  So, the
ofproto-dpif-upcall module can react properly with deleting
the ukeys and with collecting flows' last stats.

Reported-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Alex Wang <ee07b291@gmail.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joestringer@nicira.com>

											
										
										
											2015-08-25 16:36:46 -07:00
+								    dpif_netdev_register_dp_purge_cb,
-												dpif-netdev: Polling threads directly call ofproto upcall functions.

Typically, kernel datapath threads send upcalls to userspace where
handler threads process the upcalls. For TAP and DPDK devices, the
datapath threads operate in userspace, so there is no need for
separate handler threads.

This patch allows userspace datapath threads to directly call the
ofproto upcall functions, eliminating the need for handler threads
for datapaths of type 'netdev'.

Signed-off-by: Ryan Wilson <wryan@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2014-07-26 06:51:55 +00:00
+								    dpif_netdev_register_upcall_cb,
 								    dpif_netdev_enable_upcall,
 								    dpif_netdev_disable_upcall,
-												bridge: Store datapath version into ovsdb

OVS userspace are backward compatible with older Linux kernel modules.
However, not having the most up-to-date datapath kernel modules can
some times lead to user confusion. Storing the datapath version in
OVSDB allows management software to check and optionally provide
notifications to users.

Signed-off-by: Andy Zhou <azhou@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-10-16 15:23:11 -07:00
+								    dpif_netdev_get_datapath_version,
-												dpif-netdev: Implement conntrack dump functions.

New functions are implemented in the conntrack module to support this.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>

											
										
										
											2015-11-15 22:07:25 -08:00
+								    dpif_netdev_ct_dump_start,
 								    dpif_netdev_ct_dump_next,
 								    dpif_netdev_ct_dump_done,
-												dpif-netdev: Implement conntrack flush interface.

New functions are implemented in the conntrack module to support this.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>

											
										
										
											2015-11-15 22:07:25 -08:00
+								    dpif_netdev_ct_flush,
-												New implementation of userspace datapath, based on the netdev library.

											
										
										
											2009-06-19 14:09:39 -07:00
+								};
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								static void
 								dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
 								                              const char *argv[], void *aux OVS_UNUSED)
 								{
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    struct dp_netdev_port *port;
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								    struct dp_netdev *dp;
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								    odp_port_t port_no;
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    ovs_mutex_lock(&dp_netdev_mutex);
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								    dp = shash_find_data(&dp_netdevs, argv[1]);
 								    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								        ovs_mutex_unlock(&dp_netdev_mutex);
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
 								        return;
 								    }
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    ovs_refcount_ref(&dp->ref_cnt);
 								    ovs_mutex_unlock(&dp_netdev_mutex);
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_lock(&dp->port_mutex);
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    if (get_port_by_name(dp, argv[2], &port)) {
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								        unixctl_command_reply_error(conn, "unknown port");
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								        goto exit;
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								    }
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								    port_no = u32_to_odp(atoi(argv[3]));
 								    if (!port_no || port_no == ODPP_NONE) {
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								        unixctl_command_reply_error(conn, "bad port number");
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								        goto exit;
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								    }
-												dpif-netdev: Use hmap instead of list+array for tracking ports.

The goal is to make it easy to divide the ports into groups for handling
by threads.  It seems easy enough to do that by hash value, and a little
harder otherwise.

This commit has the side effect of raising the maximum number of ports from
256 to UINT32_MAX-1.  That is why some tests need to be updated:
previously, internally generated port names like "ovs_vxlan_4341" were
ignored because 4341 is bigger than the previous limit of 256.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-12-24 16:08:57 -08:00
+								    if (dp_netdev_lookup_port(dp, port_no)) {
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								        unixctl_command_reply_error(conn, "port number already in use");
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								        goto exit;
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								    }
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    /* Remove port. */
 								    hmap_remove(&dp->ports, &port->node);
 								    dp_netdev_del_port_from_all_pmds(dp, port);
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
-												dpif-netdev: Use hmap for ports.

netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Tested-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Ilya Maximets <i.maximets@samsung.com>

											
										
										
											2016-04-04 11:15:12 -07:00
+								    /* Reinsert with new port number. */
 								    port->port_no = port_no;
 								    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
 								    dp_netdev_add_port_to_pmds(dp, port);
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
-												dpif-netdev: Avoid races on queue and port changes using seq objects.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-08-07 13:29:54 -07:00
+								    seq_change(dp->port_seq);
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								    unixctl_command_reply(conn, NULL);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
 								exit:
-												dpif-netdev: Use cmap for ports.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>

											
										
										
											2014-05-20 13:21:09 -07:00
+								    ovs_mutex_unlock(&dp->port_mutex);
-												dpif-netdev: Make thread-safety much more granular.

This will allow for parallelism in multithreaded forwarding in an upcoming
commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-08 15:58:11 -08:00
+								    dp_netdev_unref(dp);
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+								}
-												dummy: Make --enable-dummy=override replace all dpifs, netdevs by dummies.

Plain "--enable-dummy" just creates new dummy dpif and netdev classes.
This commit makes "--enable-dummy=override" go a step farther and actually
delete and replace all the existing dpif and netdev classes by copies of
the dummy class.

This is useful for testing in an environment where changing the classes in
Bridge or Interface records is challenging.

Requested-by: Andrew Lambeth <wal@nicira.com>
Tested-by: Andrew Lambeth <wal@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-19 10:24:46 -08:00
+								static void
 								dpif_dummy_register__(const char *type)
 								{
 								    struct dpif_class *class;
 								    class = xmalloc(sizeof *class);
 								    *class = dpif_netdev_class;
 								    class->type = xstrdup(type);
 								    dp_register_provider(class);
 								}
-												dummy: Introduce new --enable-dummy=system option.

Until now there have been two variants for --enable-dummy:

    * --enable-dummy: This adds support for "dummy" dpif and netdev.

    * --enable-dummy=override: In addition, this replaces *every* existing
      dpif and netdev by the dummy type.

The latter is useful for testing but it defeats the possibility of using
the userspace native tunneling implementation (because all the tunnel
netdevs get replaced by dummy netdevs).  Thus, this commit adds a third
variant:

    * --enable-dummy=system: This replaces the "system" dpif and netdev
      by dummies but leaves the others untouched.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2015-06-13 16:58:49 -07:00
+								static void
 								dpif_dummy_override(const char *type)
 								{
-												dpif_dummy_override: Allow overriding a non-existing provider

This allows --enable-dummy=system with a userland-only build.
It's useful for testsuite.

Signed-off-by: YAMAMOTO Takashi <yamamoto@midokura.com>
Acked-by: Ben Pfaff <blp@ovn.org>

											
										
										
											2015-10-14 17:39:40 +00:00
+								    int error;
 								    /*
 								     * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
 								     * a userland-only build.  It's useful for testsuite.
 								     */
 								    error = dp_unregister_provider(type);
 								    if (error == 0 || error == EAFNOSUPPORT) {
-												dummy: Introduce new --enable-dummy=system option.

Until now there have been two variants for --enable-dummy:

    * --enable-dummy: This adds support for "dummy" dpif and netdev.

    * --enable-dummy=override: In addition, this replaces *every* existing
      dpif and netdev by the dummy type.

The latter is useful for testing but it defeats the possibility of using
the userspace native tunneling implementation (because all the tunnel
netdevs get replaced by dummy netdevs).  Thus, this commit adds a third
variant:

    * --enable-dummy=system: This replaces the "system" dpif and netdev
      by dummies but leaves the others untouched.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2015-06-13 16:58:49 -07:00
+								        dpif_dummy_register__(type);
 								    }
 								}
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								void
-												dummy: Introduce new --enable-dummy=system option.

Until now there have been two variants for --enable-dummy:

    * --enable-dummy: This adds support for "dummy" dpif and netdev.

    * --enable-dummy=override: In addition, this replaces *every* existing
      dpif and netdev by the dummy type.

The latter is useful for testing but it defeats the possibility of using
the userspace native tunneling implementation (because all the tunnel
netdevs get replaced by dummy netdevs).  Thus, this commit adds a third
variant:

    * --enable-dummy=system: This replaces the "system" dpif and netdev
      by dummies but leaves the others untouched.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2015-06-13 16:58:49 -07:00
+								dpif_dummy_register(enum dummy_level level)
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								{
-												dummy: Introduce new --enable-dummy=system option.

Until now there have been two variants for --enable-dummy:

    * --enable-dummy: This adds support for "dummy" dpif and netdev.

    * --enable-dummy=override: In addition, this replaces *every* existing
      dpif and netdev by the dummy type.

The latter is useful for testing but it defeats the possibility of using
the userspace native tunneling implementation (because all the tunnel
netdevs get replaced by dummy netdevs).  Thus, this commit adds a third
variant:

    * --enable-dummy=system: This replaces the "system" dpif and netdev
      by dummies but leaves the others untouched.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2015-06-13 16:58:49 -07:00
+								    if (level == DUMMY_OVERRIDE_ALL) {
-												dummy: Make --enable-dummy=override replace all dpifs, netdevs by dummies.

Plain "--enable-dummy" just creates new dummy dpif and netdev classes.
This commit makes "--enable-dummy=override" go a step farther and actually
delete and replace all the existing dpif and netdev classes by copies of
the dummy class.

This is useful for testing in an environment where changing the classes in
Bridge or Interface records is challenging.

Requested-by: Andrew Lambeth <wal@nicira.com>
Tested-by: Andrew Lambeth <wal@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-19 10:24:46 -08:00
+								        struct sset types;
 								        const char *type;
 								        sset_init(&types);
 								        dp_enumerate_types(&types);
 								        SSET_FOR_EACH (type, &types) {
-												dummy: Introduce new --enable-dummy=system option.

Until now there have been two variants for --enable-dummy:

    * --enable-dummy: This adds support for "dummy" dpif and netdev.

    * --enable-dummy=override: In addition, this replaces *every* existing
      dpif and netdev by the dummy type.

The latter is useful for testing but it defeats the possibility of using
the userspace native tunneling implementation (because all the tunnel
netdevs get replaced by dummy netdevs).  Thus, this commit adds a third
variant:

    * --enable-dummy=system: This replaces the "system" dpif and netdev
      by dummies but leaves the others untouched.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2015-06-13 16:58:49 -07:00
+								            dpif_dummy_override(type);
-												dummy: Make --enable-dummy=override replace all dpifs, netdevs by dummies.

Plain "--enable-dummy" just creates new dummy dpif and netdev classes.
This commit makes "--enable-dummy=override" go a step farther and actually
delete and replace all the existing dpif and netdev classes by copies of
the dummy class.

This is useful for testing in an environment where changing the classes in
Bridge or Interface records is challenging.

Requested-by: Andrew Lambeth <wal@nicira.com>
Tested-by: Andrew Lambeth <wal@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-19 10:24:46 -08:00
+								        }
 								        sset_destroy(&types);
-												dummy: Introduce new --enable-dummy=system option.

Until now there have been two variants for --enable-dummy:

    * --enable-dummy: This adds support for "dummy" dpif and netdev.

    * --enable-dummy=override: In addition, this replaces *every* existing
      dpif and netdev by the dummy type.

The latter is useful for testing but it defeats the possibility of using
the userspace native tunneling implementation (because all the tunnel
netdevs get replaced by dummy netdevs).  Thus, this commit adds a third
variant:

    * --enable-dummy=system: This replaces the "system" dpif and netdev
      by dummies but leaves the others untouched.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2015-06-13 16:58:49 -07:00
+								    } else if (level == DUMMY_OVERRIDE_SYSTEM) {
 								        dpif_dummy_override("system");
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								    }
-												dummy: Make --enable-dummy=override replace all dpifs, netdevs by dummies.

Plain "--enable-dummy" just creates new dummy dpif and netdev classes.
This commit makes "--enable-dummy=override" go a step farther and actually
delete and replace all the existing dpif and netdev classes by copies of
the dummy class.

This is useful for testing in an environment where changing the classes in
Bridge or Interface records is challenging.

Requested-by: Andrew Lambeth <wal@nicira.com>
Tested-by: Andrew Lambeth <wal@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-01-19 10:24:46 -08:00
 								    dpif_dummy_register__("dummy");
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
 								    unixctl_command_register("dpif-dummy/change-port-number",
-												unixctl: Make command description all lowercase.

Signed-off-by: Alex Wang <alexw@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-08-22 16:27:22 -07:00
+								                             "dp port new-number",
-												ofproto-dpif: Tolerate spontaneous changes in datapath port numbers.

This can happen on ESX.

Also adds a test to make sure this works.

Bug #17634.
Signed-off-by: Ben Pfaff <blp@nicira.com>
Tested-by: Guolin Yang <gyang@vmware.com>

											
										
										
											2013-07-29 15:11:49 -07:00
+, 3, dpif_dummy_change_port_number, NULL);
-												Add new "dummy" netdev and dpif implementations for use in unit tests.

											
										
										
											2010-11-29 12:21:08 -08:00
+								}
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
 								/* Datapath Classifier. */
 								/* A set of rules that all have the same fields wildcarded. */
 								struct dpcls_subtable {
 								    /* The fields are only used by writers. */
 								    struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
 								    /* These fields are accessed by readers. */
 								    struct cmap rules;           /* Contains "struct dpcls_rule"s. */
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    uint32_t hit_cnt;            /* Number of match hits in subtable in current
 								                                    optimization interval. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    struct netdev_flow_key mask; /* Wildcards for fields (const). */
 								    /* 'mask' must be the last field, additional space is allocated here. */
 								};
 								/* Initializes 'cls' as a classifier that initially contains no classification
 								 * rules. */
 								static void
 								dpcls_init(struct dpcls *cls)
 								{
 								    cmap_init(&cls->subtables_map);
-												Revert "pvector: Expose non-concurrent priority vector."

This reverts commit 8bdfe1313894047d44349fa4cf4402970865950f.

I failed to see that lib/dpif-netdev.c actually needs the concurrency
provided by pvector prior to this change.  More specifically, when a
subtable is removed, concurrent lookups may skip over another subtable
swapped in to the place of the removed subtable in the vector.

Since this was the only use of the non-concurrent pvector, it is
cleaner to revert the whole patch.

Reported-by: Jan Scheurich <jan.scheurich@ericsson.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
											
										
										
											2016-08-10 14:58:51 -07:00
+								    pvector_init(&cls->subtables);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								}
 								static void
 								dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
 								{
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
-												Revert "pvector: Expose non-concurrent priority vector."

This reverts commit 8bdfe1313894047d44349fa4cf4402970865950f.

I failed to see that lib/dpif-netdev.c actually needs the concurrency
provided by pvector prior to this change.  More specifically, when a
subtable is removed, concurrent lookups may skip over another subtable
swapped in to the place of the removed subtable in the vector.

Since this was the only use of the non-concurrent pvector, it is
cleaner to revert the whole patch.

Reported-by: Jan Scheurich <jan.scheurich@ericsson.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
											
										
										
											2016-08-10 14:58:51 -07:00
+								    pvector_remove(&cls->subtables, subtable);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    cmap_remove(&cls->subtables_map, &subtable->cmap_node,
 								                subtable->mask.hash);
 								    cmap_destroy(&subtable->rules);
 								    ovsrcu_postpone(free, subtable);
 								}
 								/* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
 								 * caller's responsibility.
 								 * May only be called after all the readers have been terminated. */
 								static void
 								dpcls_destroy(struct dpcls *cls)
 								{
 								    if (cls) {
 								        struct dpcls_subtable *subtable;
 								        CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
-												flow: Split miniflow's map.

Use two maps in miniflow to allow for expansion of struct flow past
512 bytes.  We now have one map for tunnel related fields, and another
for the rest of the packet metadata and actual packet header fields.
This split has the benefit that for non-tunneled packets the overhead
should be minimal.

Some miniflow utilities now exist in two variants, new ones operating
over all the data, and the old ones operating only on a single 64-bit
map at a time.  The old ones require doubling of code but should
execute faster, so those are used in the datapath and classifier's
lookup path.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-17 15:18:43 -07:00
+								            ovs_assert(cmap_count(&subtable->rules) == 0);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								            dpcls_destroy_subtable(cls, subtable);
 								        }
 								        cmap_destroy(&cls->subtables_map);
-												Revert "pvector: Expose non-concurrent priority vector."

This reverts commit 8bdfe1313894047d44349fa4cf4402970865950f.

I failed to see that lib/dpif-netdev.c actually needs the concurrency
provided by pvector prior to this change.  More specifically, when a
subtable is removed, concurrent lookups may skip over another subtable
swapped in to the place of the removed subtable in the vector.

Since this was the only use of the non-concurrent pvector, it is
cleaner to revert the whole patch.

Reported-by: Jan Scheurich <jan.scheurich@ericsson.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
											
										
										
											2016-08-10 14:58:51 -07:00
+								        pvector_destroy(&cls->subtables);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    }
 								}
 								static struct dpcls_subtable *
 								dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
 								{
 								    struct dpcls_subtable *subtable;
 								    /* Need to add one. */
-												lib/dpif-netdev: Fix EMC lookup.

Patch 0de8783a9 (lib/dpif-netdev: Integrate megaflow classifier.)
broke exact match cache lookup, but it went undetected since there are
no separate stats for EMC.

This patch fixes the problem by changing the struct netdev_flow_key
'len' member to cover only the 'mf' member, not the whole
netdev_flow_key, and ignoring the 'len' field in
netdev_flow_key_equal.  Comparison is still accurate, as the miniflow
'map' field encodes the length in the number of 1-bits, and the map is
included in the comparison.

Reported-by: Alex Wang <alexw@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Daniele Di Proietto <ddiproietto@vmware.com>

											
										
										
											2014-10-17 17:03:13 -07:00
+								    subtable = xmalloc(sizeof *subtable
 								                       - sizeof subtable->mask.mf + mask->len);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    cmap_init(&subtable->rules);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    subtable->hit_cnt = 0;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    netdev_flow_key_clone(&subtable->mask, mask);
 								    cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    /* Add the new subtable at the end of the pvector (with no hits yet) */
-												Revert "pvector: Expose non-concurrent priority vector."

This reverts commit 8bdfe1313894047d44349fa4cf4402970865950f.

I failed to see that lib/dpif-netdev.c actually needs the concurrency
provided by pvector prior to this change.  More specifically, when a
subtable is removed, concurrent lookups may skip over another subtable
swapped in to the place of the removed subtable in the vector.

Since this was the only use of the non-concurrent pvector, it is
cleaner to revert the whole patch.

Reported-by: Jan Scheurich <jan.scheurich@ericsson.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
											
										
										
											2016-08-10 14:58:51 -07:00
+								    pvector_insert(&cls->subtables, subtable, 0);
-												dpif-netdev: Fix -Wformat warning on 32-bit build.

Use the appropriate format specifier for size_t, otherwise the 32-bit
build fails.

Reported-at: https://travis-ci.org/openvswitch/ovs/jobs/151938383
Fixes: 3453b4d62a98("dpif-netdev: dpcls per in_port with sorted
subtables")
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Joe Stringer <joe@ovn.org>

											
										
										
											2016-08-12 15:38:50 -07:00
+								    VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								             cmap_count(&cls->subtables_map), subtable, cls->in_port);
-												Revert "pvector: Expose non-concurrent priority vector."

This reverts commit 8bdfe1313894047d44349fa4cf4402970865950f.

I failed to see that lib/dpif-netdev.c actually needs the concurrency
provided by pvector prior to this change.  More specifically, when a
subtable is removed, concurrent lookups may skip over another subtable
swapped in to the place of the removed subtable in the vector.

Since this was the only use of the non-concurrent pvector, it is
cleaner to revert the whole patch.

Reported-by: Jan Scheurich <jan.scheurich@ericsson.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
											
										
										
											2016-08-10 14:58:51 -07:00
+								    pvector_publish(&cls->subtables);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
 								    return subtable;
 								}
 								static inline struct dpcls_subtable *
 								dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
 								{
 								    struct dpcls_subtable *subtable;
 								    CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
 								                             &cls->subtables_map) {
 								        if (netdev_flow_key_equal(&subtable->mask, mask)) {
 								            return subtable;
 								        }
 								    }
 								    return dpcls_create_subtable(cls, mask);
 								}
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
 								/* Periodically sort the dpcls subtable vectors according to hit counts */
 								static void
 								dpcls_sort_subtable_vector(struct dpcls *cls)
 								{
 								    struct pvector *pvec = &cls->subtables;
 								    struct dpcls_subtable *subtable;
 								    PVECTOR_FOR_EACH (subtable, pvec) {
 								        pvector_change_priority(pvec, subtable, subtable->hit_cnt);
 								        subtable->hit_cnt = 0;
 								    }
 								    pvector_publish(pvec);
 								}
 								static inline void
 								dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd)
 								{
 								    struct dpcls *cls;
 								    long long int now = time_msec();
 								    if (now > pmd->next_optimization) {
 								        /* Try to obtain the flow lock to block out revalidator threads.
 								         * If not possible, just try next time. */
 								        if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
 								            /* Optimize each classifier */
 								            CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
 								                dpcls_sort_subtable_vector(cls);
 								            }
 								            ovs_mutex_unlock(&pmd->flow_mutex);
 								            /* Start new measuring interval */
 								            pmd->next_optimization = now + DPCLS_OPTIMIZATION_INTERVAL;
 								        }
 								    }
 								}
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								/* Insert 'rule' into 'cls'. */
 								static void
 								dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
 								             const struct netdev_flow_key *mask)
 								{
 								    struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    /* Refer to subtable's mask, also for later removal. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    rule->mask = &subtable->mask;
 								    cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
 								}
 								/* Removes 'rule' from 'cls', also destructing the 'rule'. */
 								static void
 								dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
 								{
 								    struct dpcls_subtable *subtable;
 								    ovs_assert(rule->mask);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    /* Get subtable from reference in rule->mask. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    INIT_CONTAINER(subtable, rule->mask, mask);
 								    if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
 								        == 0) {
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								        /* Delete empty subtable. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								        dpcls_destroy_subtable(cls, subtable);
-												Revert "pvector: Expose non-concurrent priority vector."

This reverts commit 8bdfe1313894047d44349fa4cf4402970865950f.

I failed to see that lib/dpif-netdev.c actually needs the concurrency
provided by pvector prior to this change.  More specifically, when a
subtable is removed, concurrent lookups may skip over another subtable
swapped in to the place of the removed subtable in the vector.

Since this was the only use of the non-concurrent pvector, it is
cleaner to revert the whole patch.

Reported-by: Jan Scheurich <jan.scheurich@ericsson.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
											
										
										
											2016-08-10 14:58:51 -07:00
+								        pvector_publish(&cls->subtables);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    }
 								}
-												flow: Split miniflow's map.

Use two maps in miniflow to allow for expansion of struct flow past
512 bytes.  We now have one map for tunnel related fields, and another
for the rest of the packet metadata and actual packet header fields.
This split has the benefit that for non-tunneled packets the overhead
should be minimal.

Some miniflow utilities now exist in two variants, new ones operating
over all the data, and the old ones operating only on a single 64-bit
map at a time.  The old ones require doubling of code but should
execute faster, so those are used in the datapath and classifier's
lookup path.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-17 15:18:43 -07:00
+								/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
 								 * in 'mask' the values in 'key' and 'target' are the same. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								static inline bool
 								dpcls_rule_matches_key(const struct dpcls_rule *rule,
 								                       const struct netdev_flow_key *target)
 								{
-												flow: Make compile with MSVC.

MSVC does not like zero sized arrays in structs.  Hence, remove the
'values' member from struct miniflow and add back the getters
miniflow_values() and miniflow_get_values().

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2015-07-16 17:42:24 -07:00
+								    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
 								    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    uint64_t value;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
-												flow: Add struct flowmap.

Struct miniflow is now sometimes used just as a map.  Define a new
struct flowmap for that purpose.  The flowmap is defined as an array of
maps, and it is automatically sized according to the size of struct
flow, so it will be easier to maintain in the future.

It would have been tempting to use the existing struct bitmap for this
purpose. The main reason this is not feasible at the moment is that
some flowmap algorithms are simpler when it can be assumed that no
struct flow member requires more bits than can fit to a single map
unit. The tunnel member already requires more than 32 bits, so the map
unit needs to be 64 bits wide.

Performance critical algorithms enumerate the flowmap array units
explicitly, as it is easier for the compiler to optimize, compared to
the normal iterator.  Without this optimization a classifier lookup
without wildcard masks would be about 25% slower.

With this more general (and maintainable) algorithm the classifier
lookups are about 5% slower, when the struct flow actually becomes big
enough to require a second map.  This negates the performance gained
in the "Pre-compute stage masks" patch earlier in the series.

Requested-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
											
										
										
											2015-08-25 13:55:03 -07:00
+								    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
 								        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								            return false;
 								        }
 								    }
 								    return true;
 								}
-												dpcls_lookup: added comments.

This patch adds some comments to the dpcls_lookup() funtion,
which is one of the most important places where the Userspace
wildcard matching happens.
The purpose is to give some more explanations on its design
and also on how it works.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
											
										
										
											2016-08-05 14:40:03 +01:00
+								/* For each miniflow in 'keys' performs a classifier lookup writing the result
 								 * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								 * NULL it is skipped.
 								 *
 								 * This function is optimized for use in the userspace datapath and therefore
 								 * does not implement a lot of features available in the standard
 								 * classifier_lookup() function.  Specifically, it does not implement
 								 * priorities, instead returning any rule which matches the flow.
 								 *
-												dpcls_lookup: added comments.

This patch adds some comments to the dpcls_lookup() funtion,
which is one of the most important places where the Userspace
wildcard matching happens.
The purpose is to give some more explanations on its design
and also on how it works.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
											
										
										
											2016-08-05 14:40:03 +01:00
+								 * Returns true if all miniflows found a corresponding rule. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								static bool
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
 								             struct dpcls_rule **rules, const size_t cnt,
 								             int *num_lookups_p)
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								{
-												dpcls_lookup: added comments.

This patch adds some comments to the dpcls_lookup() funtion,
which is one of the most important places where the Userspace
wildcard matching happens.
The purpose is to give some more explanations on its design
and also on how it works.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
											
										
										
											2016-08-05 14:40:03 +01:00
+								    /* The received 'cnt' miniflows are the search-keys that will be processed
-												dpcls: Use 32 packet batches for lookups.

This patch increases the number of packets processed in a batch during a
lookup from 16 to 32. Processing batches of 32 packets improves
performance and also one of the internal loops can be avoided here.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-20 21:45:16 +01:00
+								     * to find a matching entry into the available subtables.
 								     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
 								    typedef uint32_t map_type;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								#define MAP_BITS (sizeof(map_type) * CHAR_BIT)
-												dpcls: Use 32 packet batches for lookups.

This patch increases the number of packets processed in a batch during a
lookup from 16 to 32. Processing batches of 32 packets improves
performance and also one of the internal loops can be avoided here.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-20 21:45:16 +01:00
+								    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
 								    struct dpcls_subtable *subtable;
-												dpcls: Use 32 packet batches for lookups.

This patch increases the number of packets processed in a batch during a
lookup from 16 to 32. Processing batches of 32 packets improves
performance and also one of the internal loops can be avoided here.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-20 21:45:16 +01:00
+								    map_type keys_map = TYPE_MAXIMUM(map_type); /* Set all bits. */
 								    map_type found_map;
 								    uint32_t hashes[MAP_BITS];
 								    const struct cmap_node *nodes[MAP_BITS];
 								    if (cnt != MAP_BITS) {
 								        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    }
 								    memset(rules, 0, cnt * sizeof *rules);
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								    int lookups_match = 0, subtable_pos = 1;
-												dpcls_lookup: added comments.

This patch adds some comments to the dpcls_lookup() funtion,
which is one of the most important places where the Userspace
wildcard matching happens.
The purpose is to give some more explanations on its design
and also on how it works.

Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
											
										
										
											2016-08-05 14:40:03 +01:00
+								    /* The Datapath classifier - aka dpcls - is composed of subtables.
 								     * Subtables are dynamically created as needed when new rules are inserted.
 								     * Each subtable collects rules with matches on a specific subset of packet
 								     * fields as defined by the subtable's mask.  We proceed to process every
 								     * search-key against each subtable, but when a match is found for a
 								     * search-key, the search for that key can stop because the rules are
 								     * non-overlapping. */
-												Revert "pvector: Expose non-concurrent priority vector."

This reverts commit 8bdfe1313894047d44349fa4cf4402970865950f.

I failed to see that lib/dpif-netdev.c actually needs the concurrency
provided by pvector prior to this change.  More specifically, when a
subtable is removed, concurrent lookups may skip over another subtable
swapped in to the place of the removed subtable in the vector.

Since this was the only use of the non-concurrent pvector, it is
cleaner to revert the whole patch.

Reported-by: Jan Scheurich <jan.scheurich@ericsson.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
											
										
										
											2016-08-10 14:58:51 -07:00
+								    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
-												dpcls: Use 32 packet batches for lookups.

This patch increases the number of packets processed in a batch during a
lookup from 16 to 32. Processing batches of 32 packets improves
performance and also one of the internal loops can be avoided here.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-20 21:45:16 +01:00
+								        int i;
 								        /* Compute hashes for the remaining keys.  Each search-key is
 								         * masked with the subtable's mask to avoid hashing the wildcarded
 								         * bits. */
 								        ULLONG_FOR_EACH_1(i, keys_map) {
 								            hashes[i] = netdev_flow_key_hash_in_mask(&keys[i],
 								                                                     &subtable->mask);
 								        }
 								        /* Lookup. */
 								        found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
 								        /* Check results.  When the i-th bit of found_map is set, it means
 								         * that a set of nodes with a matching hash value was found for the
 								         * i-th search-key.  Due to possible hash collisions we need to check
 								         * which of the found rules, if any, really matches our masked
 								         * search-key. */
 								        ULLONG_FOR_EACH_1(i, found_map) {
 								            struct dpcls_rule *rule;
 								            CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
 								                if (OVS_LIKELY(dpcls_rule_matches_key(rule, &keys[i]))) {
 								                    rules[i] = rule;
 								                    /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
 								                     * within one second optimization interval. */
 								                    subtable->hit_cnt++;
 								                    lookups_match += subtable_pos;
 								                    goto next;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								                }
 								            }
-												dpcls: Use 32 packet batches for lookups.

This patch increases the number of packets processed in a batch during a
lookup from 16 to 32. Processing batches of 32 packets improves
performance and also one of the internal loops can be avoided here.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-20 21:45:16 +01:00
+								            /* None of the found rules was a match.  Reset the i-th bit to
 								             * keep searching this key in the next subtable. */
 								            ULLONG_SET0(found_map, i);  /* Did not match. */
 								        next:
 								            ;                     /* Keep Sparse happy. */
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								        }
-												dpcls: Use 32 packet batches for lookups.

This patch increases the number of packets processed in a batch during a
lookup from 16 to 32. Processing batches of 32 packets improves
performance and also one of the internal loops can be avoided here.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Acked-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-10-20 21:45:16 +01:00
+								        keys_map &= ~found_map;             /* Clear the found rules. */
 								        if (!keys_map) {
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								            if (num_lookups_p) {
 								                *num_lookups_p = lookups_match;
 								            }
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								            return true;              /* All found. */
 								        }
-												dpif-netdev: dpcls per in_port with sorted subtables

The user-space datapath (dpif-netdev) consists of a first level "exact match
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With
many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient
and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables
(aka subtables), each containing megaflow entries with the same mask of
packet header and metadata fields to match upon. A dpcls lookup matches a
given packet against all subtables in sequence until it hits a match. As
megaflow cache entries are by construction non-overlapping, the first match
is the only match.

Today the order of the subtables in the dpcls is essentially random so that
on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the
total number of subtables. Even though every single hash-table lookup is
fast, the performance of the current dpcls degrades when there are many
subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a
small subset of subtables that have hits. The entire megaflow cache typically
decomposes nicely into partitions that are hit only by packets entering from
a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM ->
Phy).

Therefore, maintaining a separate dpcls instance per ingress port with its
subtable vector sorted by frequency of hits reduces the average number of
subtables lookups in the dpcls to a minimum, even if the total number of
subtables gets large. This is possible because megaflows always have an exact
match on in_port, so every megaflow belongs to unique dpcls instance.

For thread safety, the PMD thread needs to block out revalidators during the
periodic optimization. We use ovs_mutex_trylock() to avoid blocking the PMD.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups
per hit" to report the average number of subtable lookup needed for a
megaflow match. Ideally, this should be close to 1 and almost all cases much
smaller than N/2.

The PMD tests have been adjusted to the additional line in pmd-stats-show.

We have benchmarked a L3-VPN pipeline on top of a VXLAN overlay mesh.
With pure L3 tenant traffic between VMs on different nodes the resulting
netdev dpcls contains N=4 subtables. Each packet traversing the OVS
datapath is subject to dpcls lookup twice due to the tunnel termination.

Disabling the EMC, we have measured a baseline performance (in+out) of ~1.45
Mpps (64 bytes, 10K L4 packet flows). The average number of subtable lookups
per dpcls match is 2.5. With the patch the average number of subtable lookups
per dpcls match is reduced to 1 and the forwarding performance grows by ~50%
to 2.13 Mpps.

Even with EMC enabled, the patch improves the performance by 9% (for 1000 L4
flows) and 34% (for 50K+ L4 flows).

As the actual number of subtables will often be higher in reality, we can
assume that this is at the lower end of the speed-up one can expect from this
optimization. Just running a parallel ping between the VXLAN tunnel endpoints
increases the number of subtables and hence the average number of subtable
lookups from 2.5 to 3.5 on master with a corresponding decrease of throughput
to 1.2 Mpps. With the patch the parallel ping has no impact on average number
of subtable lookups and performance. The performance gain is then ~75%.

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>

											
										
										
											2016-08-11 12:02:27 +02:00
+								        subtable_pos++;
 								    }
 								    if (num_lookups_p) {
 								        *num_lookups_p = lookups_match;
-												lib/dpif-netdev: Integrate megaflow classifier.

Megaflow inserts and removals are simplified:

- No need for classifier internal mutex, as dpif-netdev already has a
  'flow_mutex'.
- Number of memory allocations/frees can be halved.
- Lookup code path can rely on netdev_flow_key always having inline data.

This will also be easier to simplify further when moving to per-thread
megaflow classifiers in the future.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Alex Wang <alexw@nicira.com>

											
										
										
											2014-10-17 09:37:11 -07:00
+								    }
 								    return false;                     /* Some misses. */
 								}