/*
* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include "dpif-netdev.h"
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <net/if.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>
#include "bitmap.h"
#include "cmap.h"
#include "conntrack.h"
#include "coverage.h"
#include "ct-dpif.h"
#include "csum.h"
#include "dp-packet.h"
#include "dpif.h"
#include "dpif-netdev-perf.h"
#include "dpif-provider.h"
#include "dummy.h"
#include "fat-rwlock.h"
#include "flow.h"
#include "hmapx.h"
#include "id-pool.h"
#include "latch.h"
#include "netdev.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netlink.h"
#include "odp-execute.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
#include "openvswitch/ofp-parse.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/ofpbuf.h"
#include "openvswitch/shash.h"
#include "openvswitch/vlog.h"
#include "ovs-numa.h"
#include "ovs-rcu.h"
#include "packets.h"
#include "openvswitch/poll-loop.h"
#include "pvector.h"
#include "random.h"
#include "seq.h"
#include "smap.h"
#include "sset.h"
#include "timeval.h"
#include "tnl-neigh-cache.h"
#include "tnl-ports.h"
#include "unixctl.h"
#include "util.h"
#include "uuid.h"
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
/* Auto Load Balancing Defaults */
#define ALB_ACCEPTABLE_IMPROVEMENT 25
#define ALB_PMD_LOAD_THRESHOLD 95
#define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
#define MIN_TO_MSEC 60000
#define FLOW_DUMP_MAX_BATCH 50
/* Use per thread recirc_depth to prevent recirculation loop. */
#define MAX_RECIRC_DEPTH 6
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
/* Use instant packet send by default. */
#define DEFAULT_TX_FLUSH_INTERVAL 0
/* Configuration parameters. */
enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
enum { N_METER_LOCKS = 64 }; /* Number of locks striped across the meters. */
/* Protects against changes to 'dp_netdevs'. */
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
/* Contains all 'struct dp_netdev's. */
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
= SHASH_INITIALIZER(&dp_netdevs);
static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
| CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
| CS_SRC_NAT | CS_DST_NAT)
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
static struct odp_support dp_netdev_support = {
.max_vlan_headers = SIZE_MAX,
.max_mpls_depth = SIZE_MAX,
.recirc = true,
.ct_state = true,
.ct_zone = true,
.ct_mark = true,
.ct_label = true,
.ct_state_nat = true,
.ct_orig_tuple = true,
.ct_orig_tuple6 = true,
};
/* Stores a miniflow with inline values */
struct netdev_flow_key {
uint32_t hash; /* Hash function differs for different users. */
uint32_t len; /* Length of the following miniflow (incl. map). */
struct miniflow mf;
uint64_t buf[FLOW_MAX_PACKET_U64S];
};
/* EMC cache and SMC cache compose the datapath flow cache (DFC)
*
* Exact match cache for frequently used flows
*
* The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
* search its entries for a miniflow that matches exactly the miniflow of the
* packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
*
* A cache entry holds a reference to its 'dp_netdev_flow'.
*
* A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
* entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
* them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
* value is the index of a cache entry where the miniflow could be.
*
*
* Signature match cache (SMC)
*
* This cache stores a 16-bit signature for each flow without storing keys,
* along with the corresponding 16-bit flow_table index of the
* 'dp_netdev_flow'.  Each flow thus occupies 32 bits, which is much more
* memory efficient than the EMC.  SMC uses a set-associative design in which
* each bucket contains SMC_ENTRY_PER_BUCKET entries.
* Since a 16-bit flow_table index is used, flows beyond the first 2^16
* 'dp_netdev_flow' entries cannot be indexed by the SMC and will miss in it.
*
*
* Thread-safety
* =============
*
* Each pmd_thread has its own private exact match cache.
* If dp_netdev_input is not called from a pmd thread, a mutex is used.
*/
#define EM_FLOW_HASH_SHIFT 13
#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
#define EM_FLOW_HASH_SEGS 2
/* SMC uses a set-associative design: each bucket contains a set of entries
 * that a flow item can occupy.  For now, it uses a single hash function,
 * rather than the two used in the EMC design. */
#define SMC_ENTRY_PER_BUCKET 4
#define SMC_ENTRIES (1u << 20)
#define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
#define SMC_MASK (SMC_BUCKET_CNT - 1)
/* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
#define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
#define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
DEFAULT_EM_FLOW_INSERT_INV_PROB)
struct emc_entry {
struct dp_netdev_flow *flow;
struct netdev_flow_key key; /* key.hash used for emc hash value. */
};
struct emc_cache {
struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
int sweep_idx; /* For emc_cache_slow_sweep(). */
};
struct smc_bucket {
uint16_t sig[SMC_ENTRY_PER_BUCKET];
uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
};
/* Signature match cache (SMC), as distinct from the EMC. */
struct smc_cache {
struct smc_bucket buckets[SMC_BUCKET_CNT];
};
struct dfc_cache {
struct emc_cache emc_cache;
struct smc_cache smc_cache;
};
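/* Illustrative sketch (hypothetical helper, simplified): one way a
 * set-associative SMC probe can use the structures above.  A bucket is
 * selected by the low bits of the packet hash and a 16-bit signature is
 * taken from the remaining bits; the stored flow_idx then indexes the
 * per-PMD flow table.  The exact signature derivation and the validation
 * of the returned index are left to the real lookup code. */
static inline bool
smc_probe_sketch(const struct smc_cache *smc, uint32_t hash,
                 uint16_t *flow_idx)
{
    const struct smc_bucket *bucket = &smc->buckets[hash & SMC_MASK];
    uint16_t sig = hash >> 16;          /* Assumed signature derivation. */

    for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
        if (bucket->sig[i] == sig) {
            *flow_idx = bucket->flow_idx[i];
            return true;                /* Signature hit, caller validates. */
        }
    }
    return false;                       /* SMC miss. */
}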
/* Iterate through every entry in the exact match cache that might contain a
 * miniflow with hash 'HASH'. */
#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
for (uint32_t i__ = 0, srch_hash__ = (HASH); \
(CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
i__ < EM_FLOW_HASH_SEGS; \
i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
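/* Illustrative sketch (hypothetical, simplified): how a caller can use the
 * EMC_FOR_EACH_POS_WITH_HASH macro above to visit the EM_FLOW_HASH_SEGS
 * candidate slots for a packet hash.  The real exact match lookup uses
 * dedicated key comparison helpers; comparing only 'hash' and 'len' here is
 * a deliberate simplification. */
static inline struct dp_netdev_flow *
emc_lookup_sketch(struct emc_cache *cache, const struct netdev_flow_key *key)
{
    struct emc_entry *entry;

    EMC_FOR_EACH_POS_WITH_HASH (cache, entry, key->hash) {
        if (entry->flow && entry->key.hash == key->hash
            && entry->key.len == key->len) {
            return entry->flow;         /* Candidate slot holds this key. */
        }
    }
    return NULL;                        /* Exact match cache miss. */
}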
/* Simple non-wildcarding single-priority classifier. */
/* Time in microseconds between successive optimizations of the dpcls
 * subtable vector. */
#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
/* Time in microseconds of the interval in which rxq processing cycles used
* in rxq to pmd assignments is measured and stored. */
#define PMD_RXQ_INTERVAL_LEN 10000000LL
/* Number of intervals for which cycles are stored
* and used during rxq to pmd assignment. */
#define PMD_RXQ_INTERVAL_MAX 6
struct dpcls {
struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
odp_port_t in_port;
struct cmap subtables_map;
struct pvector subtables;
};
/* A rule to be inserted to the classifier. */
struct dpcls_rule {
struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
struct netdev_flow_key *mask; /* Subtable's mask. */
struct netdev_flow_key flow; /* Matching key. */
/* 'flow' must be the last field, additional space is allocated here. */
};
/* Data structure to keep packet order until fastpath processing. */
struct dp_packet_flow_map {
struct dp_packet *packet;
struct dp_netdev_flow *flow;
uint16_t tcp_flags;
};
static void dpcls_init(struct dpcls *);
static void dpcls_destroy(struct dpcls *);
static void dpcls_sort_subtable_vector(struct dpcls *);
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
const struct netdev_flow_key *mask);
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
static bool dpcls_lookup(struct dpcls *cls,
const struct netdev_flow_key *keys[],
struct dpcls_rule **rules, size_t cnt,
int *num_lookups_p);
static bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
const struct netdev_flow_key *target);
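/* Illustrative sketch (hypothetical caller): driving the dpcls lookup API
 * declared above.  dpcls_lookup() fills rules[i] for every key that matched
 * a subtable, stores the total number of subtable probes performed in
 * '*num_lookups_p', and returns true only if all 'cnt' keys were found. */
static inline bool
dpcls_batch_lookup_sketch(struct dpcls *cls,
                          const struct netdev_flow_key *keys[],
                          struct dpcls_rule **rules, size_t cnt)
{
    int n_subtable_lookups = 0;
    bool all_found = dpcls_lookup(cls, keys, rules, cnt,
                                  &n_subtable_lookups);

    /* 'n_subtable_lookups / cnt' approximates the average number of
     * subtable lookups needed per hit; close to 1 is ideal. */
    return all_found;
}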
/* Set of supported meter flags */
#define DP_SUPPORTED_METER_FLAGS_MASK \
(OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
/* Set of supported meter band types */
#define DP_SUPPORTED_METER_BAND_TYPES \
( 1 << OFPMBT13_DROP )
struct dp_meter_band {
struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
uint64_t packet_count;
uint64_t byte_count;
};
struct dp_meter {
uint16_t flags;
uint16_t n_bands;
uint32_t max_delta_t;
uint64_t used;
uint64_t packet_count;
uint64_t byte_count;
struct dp_meter_band bands[];
};
struct pmd_auto_lb {
bool auto_lb_requested; /* Auto load balancing requested by user. */
bool is_enabled; /* Current status of Auto load balancing. */
uint64_t rebalance_intvl;
uint64_t rebalance_poll_timer;
};
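/* Illustrative sketch (hypothetical helper, simplified): how the fields
 * above can gate a periodic rebalance check.  'rebalance_intvl' is assumed
 * to hold the configured interval converted from minutes to milliseconds
 * (see MIN_TO_MSEC above) and 'rebalance_poll_timer' the time of the last
 * check.  The real trigger additionally compares per-PMD load against
 * ALB_PMD_LOAD_THRESHOLD before attempting a dry-run reassignment. */
static inline bool
pmd_alb_poll_due_sketch(struct pmd_auto_lb *alb, uint64_t now_ms)
{
    if (!alb->is_enabled) {
        return false;                   /* Auto load balancing is off. */
    }
    if (now_ms < alb->rebalance_poll_timer + alb->rebalance_intvl) {
        return false;                   /* Interval has not elapsed yet. */
    }
    alb->rebalance_poll_timer = now_ms; /* Start a new interval. */
    return true;
}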
/* Datapath based on the network device interface from netdev.h.
*
*
* Thread-safety
* =============
*
* Some members, marked 'const', are immutable. Accessing other members
* requires synchronization, as noted in more detail below.
*
* Acquisition order is, from outermost to innermost:
*
* dp_netdev_mutex (global)
* port_mutex
* non_pmd_mutex
*/
struct dp_netdev {
const struct dpif_class *const class;
const char *const name;
struct dpif *dpif;
struct ovs_refcount ref_cnt;
atomic_flag destroyed;
/* Ports.
*
* Any lookup into 'ports' or any access to the dp_netdev_ports found
* through 'ports' requires taking 'port_mutex'. */
struct ovs_mutex port_mutex;
struct hmap ports;
struct seq *port_seq; /* Incremented whenever a port changes. */
/* The maximum time a packet can wait in the output batch before sending. */
atomic_uint32_t tx_flush_interval;
/* Meters. */
struct ovs_mutex meter_locks[N_METER_LOCKS];
struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
/* Probability of EMC insertions is a factor of 'emc_insert_min'. */
OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
/* Enable collection of PMD performance metrics. */
atomic_bool pmd_perf_metrics;
/* Enable the SMC cache from ovsdb config */
atomic_bool smc_enable_db;
/* Protects access to ofproto-dpif-upcall interface during revalidator
* thread synchronization. */
struct fat_rwlock upcall_rwlock;
upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
void *upcall_aux;
/* Callback function for notifying about the purging of dp flows (during
 * resetting or deletion of a pmd thread). */
dp_purge_callback *dp_purge_cb;
void *dp_purge_aux;
/* Stores all 'struct dp_netdev_pmd_thread's. */
struct cmap poll_threads;
/* id pool for per thread static_tx_qid. */
struct id_pool *tx_qid_pool;
struct ovs_mutex tx_qid_pool_mutex;
/* Use measured cycles for rxq to pmd assignment. */
bool pmd_rxq_assign_cyc;
/* Protects the access of the 'struct dp_netdev_pmd_thread'
* instance for non-pmd thread. */
struct ovs_mutex non_pmd_mutex;
/* Each pmd thread will store its pointer to
* 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
ovsthread_key_t per_pmd_key;
struct seq *reconfigure_seq;
uint64_t last_reconfigure_seq;
/* Cpu mask for pin of pmd threads. */
char *pmd_cmask;
uint64_t last_tnl_conf_seq;
struct conntrack conntrack;
struct pmd_auto_lb pmd_alb;
};
static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
{
ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
}
static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
{
ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
}
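/* Illustrative sketch (hypothetical helper): the usage pattern the two
 * wrappers above exist for -- take the lock that guards this meter's slot,
 * access dp->meters[meter_id], then release the same lock.  The caller is
 * assumed to have checked that 'meter_id' is below MAX_METERS. */
static inline uint64_t
meter_packet_count_sketch(const struct dp_netdev *dp, uint32_t meter_id)
{
    uint64_t n_packets = 0;

    meter_lock(dp, meter_id);
    if (dp->meters[meter_id]) {
        n_packets = dp->meters[meter_id]->packet_count;
    }
    meter_unlock(dp, meter_id);
    return n_packets;
}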
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
odp_port_t)
OVS_REQUIRES(dp->port_mutex);
enum rxq_cycles_counter_type {
RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
processing packets during the current
interval. */
RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
during rxq to pmd assignment. */
RXQ_N_CYCLES
};
enum {
DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
};
struct dp_flow_offload_item {
struct dp_netdev_pmd_thread *pmd;
struct dp_netdev_flow *flow;
int op;
struct match match;
struct nlattr *actions;
size_t actions_len;
struct ovs_list node;
};
struct dp_flow_offload {
struct ovs_mutex mutex;
struct ovs_list list;
pthread_cond_t cond;
};
static struct dp_flow_offload dp_flow_offload = {
.mutex = OVS_MUTEX_INITIALIZER,
.list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
};
static struct ovsthread_once offload_thread_once
= OVSTHREAD_ONCE_INITIALIZER;
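/* Illustrative sketch: queueing an offload request on the shared list above
 * and waking the offload thread, which sleeps on 'cond' and pops items
 * under 'mutex'.  A hypothetical enqueue helper; the real code also takes
 * references on the pmd and flow carried by 'item'. */
static inline void
dp_netdev_offload_enqueue_sketch(struct dp_flow_offload_item *item)
{
    ovs_mutex_lock(&dp_flow_offload.mutex);
    ovs_list_push_back(&dp_flow_offload.list, &item->node);
    xpthread_cond_signal(&dp_flow_offload.cond);
    ovs_mutex_unlock(&dp_flow_offload.mutex);
}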
#define XPS_TIMEOUT 500000LL /* In microseconds. */
/* Contained by struct dp_netdev_port's 'rxqs' member. */
struct dp_netdev_rxq {
struct dp_netdev_port *port;
struct netdev_rxq *rx;
unsigned core_id; /* Core to which this queue should be
pinned. OVS_CORE_UNSPEC if the
queue doesn't need to be pinned to a
particular core. */
unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
bool is_vhost; /* Is rxq of a vhost port. */
/* Counters of cycles spent successfully polling and processing pkts. */
atomic_ullong cycles[RXQ_N_CYCLES];
/* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
sum them to yield the cycles used for an rxq. */
atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
};
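/* Illustrative sketch (the helper name below is assumed and not a function
 * used elsewhere in this file): the per-interval counters in 'cycles_intrvl'
 * are summed to obtain the recent processing cost of an rxq, which is the
 * figure used when assigning rxqs to pmd threads. */
static inline uint64_t
example_rxq_recent_cycles(struct dp_netdev_rxq *rxq)
{
    uint64_t total = 0;

    for (int i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
        uint64_t intrvl;

        /* Each slot is written atomically by the pmd thread that owns the
         * queue, so a relaxed read is sufficient here. */
        atomic_read_relaxed(&rxq->cycles_intrvl[i], &intrvl);
        total += intrvl;
    }
    return total;
}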
/* A port in a netdev-based datapath. */
struct dp_netdev_port {
odp_port_t port_no;
bool dynamic_txqs; /* If true XPS will be used. */
bool need_reconfigure; /* True if we should reconfigure netdev. */
struct netdev *netdev;
struct hmap_node node; /* Node in dp_netdev's 'ports'. */
struct netdev_saved_flags *sf;
struct dp_netdev_rxq *rxqs;
unsigned n_rxq; /* Number of elements in 'rxqs' */
unsigned *txq_used; /* Number of threads that use each tx queue. */
struct ovs_mutex txq_used_mutex;
bool emc_enabled; /* If true EMC will be used. */
char *type; /* Port type as requested by user. */
char *rxq_affinity_list; /* Requested affinity of rx queues. */
};
/* Contained by struct dp_netdev_flow's 'stats' member. */
struct dp_netdev_flow_stats {
atomic_llong used; /* Last used time, in monotonic msecs. */
atomic_ullong packet_count; /* Number of packets matched. */
atomic_ullong byte_count; /* Number of bytes matched. */
atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
};
/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
*
*
* Thread-safety
* =============
*
 * Except near the beginning or ending of its lifespan, a flow 'flow' belongs
 * to its pmd thread's classifier. The text below calls this classifier 'cls'.
*
* Motivation
* ----------
*
* The thread safety rules described here for "struct dp_netdev_flow" are
* motivated by two goals:
*
* - Prevent threads that read members of "struct dp_netdev_flow" from
* reading bad data due to changes by some thread concurrently modifying
* those members.
*
* - Prevent two threads making changes to members of a given "struct
* dp_netdev_flow" from interfering with each other.
*
*
* Rules
* -----
*
* A flow 'flow' may be accessed without a risk of being freed during an RCU
* grace period. Code that needs to hold onto a flow for a while
* should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
*
* 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
* flow from being deleted from 'cls' and it doesn't protect members of 'flow'
* from modification.
*
* Some members, marked 'const', are immutable. Accessing other members
* requires synchronization, as noted in more detail below.
*/
struct dp_netdev_flow {
const struct flow flow; /* Unmasked flow that created this entry. */
/* Hash table index by unmasked flow. */
const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
/* 'flow_table'. */
const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
const ovs_u128 ufid; /* Unique flow identifier. */
const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
/* flow. */
/* Number of references.
* The classifier owns one reference.
* Any thread trying to keep a rule from being freed should hold its own
* reference. */
struct ovs_refcount ref_cnt;
bool dead;
uint32_t mark; /* Unique flow mark assigned to a flow */
/* Statistics. */
struct dp_netdev_flow_stats stats;
/* Actions. */
OVSRCU_TYPE(struct dp_netdev_actions *) actions;
/* While processing a group of input packets, the datapath uses the next
* member to store a pointer to the output batch for the flow. It is
* reset after the batch has been sent out (See dp_netdev_queue_batches(),
* packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
struct packet_batch_per_flow *batch;
/* Packet classification. */
struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
/* 'cr' must be the last member. */
};
static void dp_netdev_flow_unref(struct dp_netdev_flow *);
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
struct flow *, bool);
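/* Illustrative sketch (assumed helper name, not used elsewhere in this file):
 * per the thread-safety rules above, a thread that wants to keep a flow
 * beyond the current RCU grace period takes its own reference with
 * dp_netdev_flow_ref() and drops it later with dp_netdev_flow_unref(). */
static inline struct dp_netdev_flow *
example_hold_flow(struct dp_netdev_flow *flow)
{
    /* dp_netdev_flow_ref() fails if the flow is already being destroyed,
     * in which case the caller must not use it. */
    return flow && dp_netdev_flow_ref(flow) ? flow : NULL;
}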
/* A set of datapath actions within a "struct dp_netdev_flow".
*
*
* Thread-safety
* =============
*
* A struct dp_netdev_actions 'actions' is protected with RCU. */
struct dp_netdev_actions {
/* These members are immutable: they do not change during the struct's
* lifetime. */
unsigned int size; /* Size of 'actions', in bytes. */
struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
};
struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
size_t);
struct dp_netdev_actions *dp_netdev_flow_get_actions(
const struct dp_netdev_flow *);
static void dp_netdev_actions_free(struct dp_netdev_actions *);
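/* Illustrative sketch (assumed helper name): because a flow's 'actions'
 * member is RCU-protected, a reader on the fast path simply loads the
 * current pointer with dp_netdev_flow_get_actions() and may use it until
 * the thread quiesces; no locking is required. */
static inline unsigned int
example_flow_actions_size(const struct dp_netdev_flow *flow)
{
    const struct dp_netdev_actions *actions
        = dp_netdev_flow_get_actions(flow);

    return actions->size;
}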
struct polled_queue {
struct dp_netdev_rxq *rxq;
odp_port_t port_no;
bool emc_enabled;
};
/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
struct rxq_poll {
struct dp_netdev_rxq *rxq;
struct hmap_node node;
};
/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
* 'tnl_port_cache' or 'tx_ports'. */
struct tx_port {
struct dp_netdev_port *port;
int qid;
long long last_used;
struct hmap_node node;
long long flush_time;
struct dp_packet_batch output_pkts;
struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
};
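/* Illustrative sketch (assumed helper name, and it assumes 'now' is expressed
 * in the same microsecond units as XPS_TIMEOUT): with dynamic tx queue ids
 * (XPS), a queue id cached in a 'tx_port' is considered stale once it has not
 * been used for XPS_TIMEOUT microseconds and may be released during the
 * periodic revalidation. */
static inline bool
example_txq_id_is_stale(const struct tx_port *tx, long long now)
{
    return tx->qid >= 0 && now - tx->last_used > XPS_TIMEOUT;
}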
/* A set of properties for the current processing loop that is not directly
* associated with the pmd thread itself, but with the packets being
* processed or the short-term system configuration (for example, time).
* Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
struct dp_netdev_pmd_thread_ctx {
/* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
long long now;
/* RX queue from which last packet was received. */
struct dp_netdev_rxq *last_rxq;
/* EMC insertion probability context for the current processing cycle. */
uint32_t emc_insert_min;
};
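/* Illustrative sketch (assumed helper name; the actual insertion logic lives
 * further down in this file): 'emc_insert_min' is compared against a random
 * 32-bit value so that only a configurable fraction of cache misses lead to
 * an EMC insertion; a threshold of zero disables insertion entirely. */
static inline bool
example_emc_should_insert(const struct dp_netdev_pmd_thread_ctx *ctx)
{
    return ctx->emc_insert_min && random_uint32() <= ctx->emc_insert_min;
}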
/* PMD: Poll mode drivers. A PMD accesses devices by polling, which eliminates
 * the performance overhead of interrupt processing. Therefore netdev cannot
 * implement rx-wait for these devices; dpif-netdev needs to poll them to
 * check for received packets. Each pmd thread polls the devices assigned
 * to it.
 *
 * DPDK uses PMDs for accessing NICs.
 *
 * Note, the instance with cpu core id NON_PMD_CORE_ID is reserved for
 * I/O of all non-pmd threads. No actual thread is created for this
 * instance.
 *
 * Each struct has its own flow cache and classifier per managed ingress port.
 * For packets received on an ingress port, a lookup is done in the
 * corresponding PMD thread's flow cache and, in case of a miss, in the
 * corresponding classifier of that port. Packets are executed with the found
 * actions in either case.
 */
struct dp_netdev_pmd_thread {
Revert "dpif_netdev: Refactor dp_netdev_pmd_thread structure." This reverts commit a807c15796ddc43ba1ffb2a6b0bd2ad4e2b73941. Padding and aligning of dp_netdev_pmd_thread structure members is useless, broken in a several ways and only greatly degrades maintainability and extensibility of the structure. Issues: 1. It's not working because all the instances of struct dp_netdev_pmd_thread allocated only by usual malloc. All the memory is not aligned to cachelines -> structure almost never starts at aligned memory address. This means that any further paddings and alignments inside the structure are completely useless. Fo example: Breakpoint 1, pmd_thread_main (gdb) p pmd $49 = (struct dp_netdev_pmd_thread *) 0x1b1af20 (gdb) p &pmd->cacheline1 $51 = (OVS_CACHE_LINE_MARKER *) 0x1b1af60 (gdb) p &pmd->cacheline0 $52 = (OVS_CACHE_LINE_MARKER *) 0x1b1af20 (gdb) p &pmd->flow_cache $53 = (struct emc_cache *) 0x1b1afe0 All of the above addresses shifted from cacheline start by 32B. Can we fix it properly? NO. OVS currently doesn't have appropriate API to allocate aligned memory. The best candidate is 'xmalloc_cacheline()' but it clearly states that "The memory returned will not be at the start of a cache line, though, so don't assume such alignment". And also, this function will never return aligned memory on Windows or MacOS. 2. CACHE_LINE_SIZE is not constant. Different architectures have different cache line sizes, but the code assumes that CACHE_LINE_SIZE is always equal to 64 bytes. All the structure members are grouped by 64 bytes and padded to CACHE_LINE_SIZE. This leads to a huge holes in a structures if CACHE_LINE_SIZE differs from 64. This is opposite to portability. If I want good performance of cmap I need to have CACHE_LINE_SIZE equal to the real cache line size, but I will have huge holes in the structures. If you'll take a look to struct rte_mbuf from DPDK you'll see that it uses 2 defines: RTE_CACHE_LINE_SIZE and RTE_CACHE_LINE_MIN_SIZE to avoid holes in mbuf structure. 3. Sizes of system/libc defined types are not constant for all the systems. For example, sizeof(pthread_mutex_t) == 48 on my ARMv8 machine, but only 40 on x86. The difference could be much bigger on Windows or MacOS systems. But the code assumes that sizeof(struct ovs_mutex) is always 48 bytes. This may lead to broken alignment/big holes in case of padding/wrong comments about amount of free pad bytes. 4. Sizes of the many fileds in structure depends on defines like DP_N_STATS, PMD_N_CYCLES, EM_FLOW_HASH_ENTRIES and so on. Any change in these defines or any change in any structure contained by thread should lead to the not so simple refactoring of the whole dp_netdev_pmd_thread structure. This greatly reduces maintainability and complicates development of a new features. 5. There is no reason to align flow_cache member because it's too big and we usually access random entries by single thread only. So, the padding/alignment only creates some visibility of performance optimization but does nothing useful in reality. It only complicates maintenance and adds huge holes for non-x86 architectures and non-Linux systems. Performance improvement stated in a original commit message should be random and not valuable. I see no performance difference. Most of the above issues are also true for some other padded/aligned structures like 'struct netdev_dpdk'. They will be treated separately. 
CC: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> CC: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Acked-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-11-22 15:11:45 +03:00
struct dp_netdev *dp;
struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
struct cmap_node node; /* In 'dp->poll_threads'. */
pthread_cond_t cond; /* For synchronizing pmd thread reload. */
struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
/* Per thread exact-match cache. Note, the instance for cpu core
     * NON_PMD_CORE_ID can be accessed by multiple threads, and thus needs
     * to be protected by 'non_pmd_mutex'. Every other instance
* will only be accessed by its own pmd thread. */
OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
/* Flow-Table and classifiers
*
* Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
* changes to 'classifiers' must be made while still holding the
* 'flow_mutex'.
*/
struct ovs_mutex flow_mutex;
Revert "dpif_netdev: Refactor dp_netdev_pmd_thread structure." This reverts commit a807c15796ddc43ba1ffb2a6b0bd2ad4e2b73941. Padding and aligning of dp_netdev_pmd_thread structure members is useless, broken in a several ways and only greatly degrades maintainability and extensibility of the structure. Issues: 1. It's not working because all the instances of struct dp_netdev_pmd_thread allocated only by usual malloc. All the memory is not aligned to cachelines -> structure almost never starts at aligned memory address. This means that any further paddings and alignments inside the structure are completely useless. Fo example: Breakpoint 1, pmd_thread_main (gdb) p pmd $49 = (struct dp_netdev_pmd_thread *) 0x1b1af20 (gdb) p &pmd->cacheline1 $51 = (OVS_CACHE_LINE_MARKER *) 0x1b1af60 (gdb) p &pmd->cacheline0 $52 = (OVS_CACHE_LINE_MARKER *) 0x1b1af20 (gdb) p &pmd->flow_cache $53 = (struct emc_cache *) 0x1b1afe0 All of the above addresses shifted from cacheline start by 32B. Can we fix it properly? NO. OVS currently doesn't have appropriate API to allocate aligned memory. The best candidate is 'xmalloc_cacheline()' but it clearly states that "The memory returned will not be at the start of a cache line, though, so don't assume such alignment". And also, this function will never return aligned memory on Windows or MacOS. 2. CACHE_LINE_SIZE is not constant. Different architectures have different cache line sizes, but the code assumes that CACHE_LINE_SIZE is always equal to 64 bytes. All the structure members are grouped by 64 bytes and padded to CACHE_LINE_SIZE. This leads to a huge holes in a structures if CACHE_LINE_SIZE differs from 64. This is opposite to portability. If I want good performance of cmap I need to have CACHE_LINE_SIZE equal to the real cache line size, but I will have huge holes in the structures. If you'll take a look to struct rte_mbuf from DPDK you'll see that it uses 2 defines: RTE_CACHE_LINE_SIZE and RTE_CACHE_LINE_MIN_SIZE to avoid holes in mbuf structure. 3. Sizes of system/libc defined types are not constant for all the systems. For example, sizeof(pthread_mutex_t) == 48 on my ARMv8 machine, but only 40 on x86. The difference could be much bigger on Windows or MacOS systems. But the code assumes that sizeof(struct ovs_mutex) is always 48 bytes. This may lead to broken alignment/big holes in case of padding/wrong comments about amount of free pad bytes. 4. Sizes of the many fileds in structure depends on defines like DP_N_STATS, PMD_N_CYCLES, EM_FLOW_HASH_ENTRIES and so on. Any change in these defines or any change in any structure contained by thread should lead to the not so simple refactoring of the whole dp_netdev_pmd_thread structure. This greatly reduces maintainability and complicates development of a new features. 5. There is no reason to align flow_cache member because it's too big and we usually access random entries by single thread only. So, the padding/alignment only creates some visibility of performance optimization but does nothing useful in reality. It only complicates maintenance and adds huge holes for non-x86 architectures and non-Linux systems. Performance improvement stated in a original commit message should be random and not valuable. I see no performance difference. Most of the above issues are also true for some other padded/aligned structures like 'struct netdev_dpdk'. They will be treated separately. 
CC: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> CC: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Acked-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-11-22 15:11:45 +03:00
struct cmap flow_table OVS_GUARDED; /* Flow table. */
/* One classifier per in_port polled by the pmd */
struct cmap classifiers;
/* Periodically sort subtable vectors according to hit frequencies */
long long int next_optimization;
/* End of the next time interval for which processing cycles
are stored for each polled rxq. */
long long int rxq_next_cycle_store;
/* Last interval timestamp. */
uint64_t intrvl_tsc_prev;
/* Last interval cycles. */
atomic_ullong intrvl_cycles;
/* Current context of the PMD thread. */
struct dp_netdev_pmd_thread_ctx ctx;
Revert "dpif_netdev: Refactor dp_netdev_pmd_thread structure." This reverts commit a807c15796ddc43ba1ffb2a6b0bd2ad4e2b73941. Padding and aligning of dp_netdev_pmd_thread structure members is useless, broken in a several ways and only greatly degrades maintainability and extensibility of the structure. Issues: 1. It's not working because all the instances of struct dp_netdev_pmd_thread allocated only by usual malloc. All the memory is not aligned to cachelines -> structure almost never starts at aligned memory address. This means that any further paddings and alignments inside the structure are completely useless. Fo example: Breakpoint 1, pmd_thread_main (gdb) p pmd $49 = (struct dp_netdev_pmd_thread *) 0x1b1af20 (gdb) p &pmd->cacheline1 $51 = (OVS_CACHE_LINE_MARKER *) 0x1b1af60 (gdb) p &pmd->cacheline0 $52 = (OVS_CACHE_LINE_MARKER *) 0x1b1af20 (gdb) p &pmd->flow_cache $53 = (struct emc_cache *) 0x1b1afe0 All of the above addresses shifted from cacheline start by 32B. Can we fix it properly? NO. OVS currently doesn't have appropriate API to allocate aligned memory. The best candidate is 'xmalloc_cacheline()' but it clearly states that "The memory returned will not be at the start of a cache line, though, so don't assume such alignment". And also, this function will never return aligned memory on Windows or MacOS. 2. CACHE_LINE_SIZE is not constant. Different architectures have different cache line sizes, but the code assumes that CACHE_LINE_SIZE is always equal to 64 bytes. All the structure members are grouped by 64 bytes and padded to CACHE_LINE_SIZE. This leads to a huge holes in a structures if CACHE_LINE_SIZE differs from 64. This is opposite to portability. If I want good performance of cmap I need to have CACHE_LINE_SIZE equal to the real cache line size, but I will have huge holes in the structures. If you'll take a look to struct rte_mbuf from DPDK you'll see that it uses 2 defines: RTE_CACHE_LINE_SIZE and RTE_CACHE_LINE_MIN_SIZE to avoid holes in mbuf structure. 3. Sizes of system/libc defined types are not constant for all the systems. For example, sizeof(pthread_mutex_t) == 48 on my ARMv8 machine, but only 40 on x86. The difference could be much bigger on Windows or MacOS systems. But the code assumes that sizeof(struct ovs_mutex) is always 48 bytes. This may lead to broken alignment/big holes in case of padding/wrong comments about amount of free pad bytes. 4. Sizes of the many fileds in structure depends on defines like DP_N_STATS, PMD_N_CYCLES, EM_FLOW_HASH_ENTRIES and so on. Any change in these defines or any change in any structure contained by thread should lead to the not so simple refactoring of the whole dp_netdev_pmd_thread structure. This greatly reduces maintainability and complicates development of a new features. 5. There is no reason to align flow_cache member because it's too big and we usually access random entries by single thread only. So, the padding/alignment only creates some visibility of performance optimization but does nothing useful in reality. It only complicates maintenance and adds huge holes for non-x86 architectures and non-Linux systems. Performance improvement stated in a original commit message should be random and not valuable. I see no performance difference. Most of the above issues are also true for some other padded/aligned structures like 'struct netdev_dpdk'. They will be treated separately. 
CC: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> CC: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Acked-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-11-22 15:11:45 +03:00
struct latch exit_latch; /* For terminating the pmd thread. */
struct seq *reload_seq;
uint64_t last_reload_seq;
atomic_bool reload; /* Do we need to reload ports? */
pthread_t thread;
unsigned core_id; /* CPU core id of this pmd thread. */
int numa_id; /* numa node id of this pmd thread. */
bool isolated;
    /* Queue id used by this pmd thread to send packets on all netdevs if
     * XPS is disabled for this netdev. All 'static_tx_qid's are unique and
     * less than 'cmap_count(dp->poll_threads)'. */
uint32_t static_tx_qid;
/* Number of filled output batches. */
int n_output_batches;
Revert "dpif_netdev: Refactor dp_netdev_pmd_thread structure." This reverts commit a807c15796ddc43ba1ffb2a6b0bd2ad4e2b73941. Padding and aligning of dp_netdev_pmd_thread structure members is useless, broken in a several ways and only greatly degrades maintainability and extensibility of the structure. Issues: 1. It's not working because all the instances of struct dp_netdev_pmd_thread allocated only by usual malloc. All the memory is not aligned to cachelines -> structure almost never starts at aligned memory address. This means that any further paddings and alignments inside the structure are completely useless. Fo example: Breakpoint 1, pmd_thread_main (gdb) p pmd $49 = (struct dp_netdev_pmd_thread *) 0x1b1af20 (gdb) p &pmd->cacheline1 $51 = (OVS_CACHE_LINE_MARKER *) 0x1b1af60 (gdb) p &pmd->cacheline0 $52 = (OVS_CACHE_LINE_MARKER *) 0x1b1af20 (gdb) p &pmd->flow_cache $53 = (struct emc_cache *) 0x1b1afe0 All of the above addresses shifted from cacheline start by 32B. Can we fix it properly? NO. OVS currently doesn't have appropriate API to allocate aligned memory. The best candidate is 'xmalloc_cacheline()' but it clearly states that "The memory returned will not be at the start of a cache line, though, so don't assume such alignment". And also, this function will never return aligned memory on Windows or MacOS. 2. CACHE_LINE_SIZE is not constant. Different architectures have different cache line sizes, but the code assumes that CACHE_LINE_SIZE is always equal to 64 bytes. All the structure members are grouped by 64 bytes and padded to CACHE_LINE_SIZE. This leads to a huge holes in a structures if CACHE_LINE_SIZE differs from 64. This is opposite to portability. If I want good performance of cmap I need to have CACHE_LINE_SIZE equal to the real cache line size, but I will have huge holes in the structures. If you'll take a look to struct rte_mbuf from DPDK you'll see that it uses 2 defines: RTE_CACHE_LINE_SIZE and RTE_CACHE_LINE_MIN_SIZE to avoid holes in mbuf structure. 3. Sizes of system/libc defined types are not constant for all the systems. For example, sizeof(pthread_mutex_t) == 48 on my ARMv8 machine, but only 40 on x86. The difference could be much bigger on Windows or MacOS systems. But the code assumes that sizeof(struct ovs_mutex) is always 48 bytes. This may lead to broken alignment/big holes in case of padding/wrong comments about amount of free pad bytes. 4. Sizes of the many fileds in structure depends on defines like DP_N_STATS, PMD_N_CYCLES, EM_FLOW_HASH_ENTRIES and so on. Any change in these defines or any change in any structure contained by thread should lead to the not so simple refactoring of the whole dp_netdev_pmd_thread structure. This greatly reduces maintainability and complicates development of a new features. 5. There is no reason to align flow_cache member because it's too big and we usually access random entries by single thread only. So, the padding/alignment only creates some visibility of performance optimization but does nothing useful in reality. It only complicates maintenance and adds huge holes for non-x86 architectures and non-Linux systems. Performance improvement stated in a original commit message should be random and not valuable. I see no performance difference. Most of the above issues are also true for some other padded/aligned structures like 'struct netdev_dpdk'. They will be treated separately. 
CC: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> CC: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Acked-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-11-22 15:11:45 +03:00
struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
/* List of rx queues to poll. */
struct hmap poll_list OVS_GUARDED;
/* Map of 'tx_port's used for transmission. Written by the main thread,
* read by the pmd thread. */
struct hmap tx_ports OVS_GUARDED;
/* These are thread-local copies of 'tx_ports'. One contains only tunnel
* ports (that support push_tunnel/pop_tunnel), the other contains ports
* with at least one txq (that support send). A port can be in both.
*
* There are two separate maps to make sure that we don't try to execute
* OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
*
* The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
     * threads, and thus need to be protected by 'non_pmd_mutex'. Every
* other instance will only be accessed by its own pmd thread. */
struct hmap tnl_port_cache;
struct hmap send_port_cache;
/* Keep track of detailed PMD performance statistics. */
struct pmd_perf_stats perf_stats;
Revert "dpif_netdev: Refactor dp_netdev_pmd_thread structure." This reverts commit a807c15796ddc43ba1ffb2a6b0bd2ad4e2b73941. Padding and aligning of dp_netdev_pmd_thread structure members is useless, broken in a several ways and only greatly degrades maintainability and extensibility of the structure. Issues: 1. It's not working because all the instances of struct dp_netdev_pmd_thread allocated only by usual malloc. All the memory is not aligned to cachelines -> structure almost never starts at aligned memory address. This means that any further paddings and alignments inside the structure are completely useless. Fo example: Breakpoint 1, pmd_thread_main (gdb) p pmd $49 = (struct dp_netdev_pmd_thread *) 0x1b1af20 (gdb) p &pmd->cacheline1 $51 = (OVS_CACHE_LINE_MARKER *) 0x1b1af60 (gdb) p &pmd->cacheline0 $52 = (OVS_CACHE_LINE_MARKER *) 0x1b1af20 (gdb) p &pmd->flow_cache $53 = (struct emc_cache *) 0x1b1afe0 All of the above addresses shifted from cacheline start by 32B. Can we fix it properly? NO. OVS currently doesn't have appropriate API to allocate aligned memory. The best candidate is 'xmalloc_cacheline()' but it clearly states that "The memory returned will not be at the start of a cache line, though, so don't assume such alignment". And also, this function will never return aligned memory on Windows or MacOS. 2. CACHE_LINE_SIZE is not constant. Different architectures have different cache line sizes, but the code assumes that CACHE_LINE_SIZE is always equal to 64 bytes. All the structure members are grouped by 64 bytes and padded to CACHE_LINE_SIZE. This leads to a huge holes in a structures if CACHE_LINE_SIZE differs from 64. This is opposite to portability. If I want good performance of cmap I need to have CACHE_LINE_SIZE equal to the real cache line size, but I will have huge holes in the structures. If you'll take a look to struct rte_mbuf from DPDK you'll see that it uses 2 defines: RTE_CACHE_LINE_SIZE and RTE_CACHE_LINE_MIN_SIZE to avoid holes in mbuf structure. 3. Sizes of system/libc defined types are not constant for all the systems. For example, sizeof(pthread_mutex_t) == 48 on my ARMv8 machine, but only 40 on x86. The difference could be much bigger on Windows or MacOS systems. But the code assumes that sizeof(struct ovs_mutex) is always 48 bytes. This may lead to broken alignment/big holes in case of padding/wrong comments about amount of free pad bytes. 4. Sizes of the many fileds in structure depends on defines like DP_N_STATS, PMD_N_CYCLES, EM_FLOW_HASH_ENTRIES and so on. Any change in these defines or any change in any structure contained by thread should lead to the not so simple refactoring of the whole dp_netdev_pmd_thread structure. This greatly reduces maintainability and complicates development of a new features. 5. There is no reason to align flow_cache member because it's too big and we usually access random entries by single thread only. So, the padding/alignment only creates some visibility of performance optimization but does nothing useful in reality. It only complicates maintenance and adds huge holes for non-x86 architectures and non-Linux systems. Performance improvement stated in a original commit message should be random and not valuable. I see no performance difference. Most of the above issues are also true for some other padded/aligned structures like 'struct netdev_dpdk'. They will be treated separately. 
CC: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> CC: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Acked-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-11-22 15:11:45 +03:00
Adding support for PMD auto load balancing Port rx queues that have not been statically assigned to PMDs are currently assigned based on periodically sampled load measurements. The assignment is performed at specific instances – port addition, port deletion, upon reassignment request via CLI etc. Due to change in traffic pattern over time it can cause uneven load among the PMDs and thus resulting in lower overall throughout. This patch enables the support of auto load balancing of PMDs based on measured load of RX queues. Each PMD measures the processing load for each of its associated queues every 10 seconds. If the aggregated PMD load reaches 95% for 6 consecutive intervals then PMD considers itself to be overloaded. If any PMD is overloaded, a dry-run of the PMD assignment algorithm is performed by OVS main thread. The dry-run does NOT change the existing queue to PMD assignments. If the resultant mapping of dry-run indicates an improved distribution of the load then the actual reassignment will be performed. The automatic rebalancing will be disabled by default and has to be enabled via configuration option. The interval (in minutes) between two consecutive rebalancing can also be configured via CLI, default is 1 min. Following example commands can be used to set the auto-lb params: ovs-vsctl set open_vswitch . other_config:pmd-auto-lb="true" ovs-vsctl set open_vswitch . other_config:pmd-auto-lb-rebalance-intvl="5" Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Co-authored-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com> Signed-off-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Tested-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2019-01-16 05:41:43 +00:00
/* Stats from previous iteration used by automatic pmd
* load balance logic. */
uint64_t prev_stats[PMD_N_STATS];
atomic_count pmd_overloaded;
Revert "dpif_netdev: Refactor dp_netdev_pmd_thread structure." This reverts commit a807c15796ddc43ba1ffb2a6b0bd2ad4e2b73941. Padding and aligning of dp_netdev_pmd_thread structure members is useless, broken in a several ways and only greatly degrades maintainability and extensibility of the structure. Issues: 1. It's not working because all the instances of struct dp_netdev_pmd_thread allocated only by usual malloc. All the memory is not aligned to cachelines -> structure almost never starts at aligned memory address. This means that any further paddings and alignments inside the structure are completely useless. Fo example: Breakpoint 1, pmd_thread_main (gdb) p pmd $49 = (struct dp_netdev_pmd_thread *) 0x1b1af20 (gdb) p &pmd->cacheline1 $51 = (OVS_CACHE_LINE_MARKER *) 0x1b1af60 (gdb) p &pmd->cacheline0 $52 = (OVS_CACHE_LINE_MARKER *) 0x1b1af20 (gdb) p &pmd->flow_cache $53 = (struct emc_cache *) 0x1b1afe0 All of the above addresses shifted from cacheline start by 32B. Can we fix it properly? NO. OVS currently doesn't have appropriate API to allocate aligned memory. The best candidate is 'xmalloc_cacheline()' but it clearly states that "The memory returned will not be at the start of a cache line, though, so don't assume such alignment". And also, this function will never return aligned memory on Windows or MacOS. 2. CACHE_LINE_SIZE is not constant. Different architectures have different cache line sizes, but the code assumes that CACHE_LINE_SIZE is always equal to 64 bytes. All the structure members are grouped by 64 bytes and padded to CACHE_LINE_SIZE. This leads to a huge holes in a structures if CACHE_LINE_SIZE differs from 64. This is opposite to portability. If I want good performance of cmap I need to have CACHE_LINE_SIZE equal to the real cache line size, but I will have huge holes in the structures. If you'll take a look to struct rte_mbuf from DPDK you'll see that it uses 2 defines: RTE_CACHE_LINE_SIZE and RTE_CACHE_LINE_MIN_SIZE to avoid holes in mbuf structure. 3. Sizes of system/libc defined types are not constant for all the systems. For example, sizeof(pthread_mutex_t) == 48 on my ARMv8 machine, but only 40 on x86. The difference could be much bigger on Windows or MacOS systems. But the code assumes that sizeof(struct ovs_mutex) is always 48 bytes. This may lead to broken alignment/big holes in case of padding/wrong comments about amount of free pad bytes. 4. Sizes of the many fileds in structure depends on defines like DP_N_STATS, PMD_N_CYCLES, EM_FLOW_HASH_ENTRIES and so on. Any change in these defines or any change in any structure contained by thread should lead to the not so simple refactoring of the whole dp_netdev_pmd_thread structure. This greatly reduces maintainability and complicates development of a new features. 5. There is no reason to align flow_cache member because it's too big and we usually access random entries by single thread only. So, the padding/alignment only creates some visibility of performance optimization but does nothing useful in reality. It only complicates maintenance and adds huge holes for non-x86 architectures and non-Linux systems. Performance improvement stated in a original commit message should be random and not valuable. I see no performance difference. Most of the above issues are also true for some other padded/aligned structures like 'struct netdev_dpdk'. They will be treated separately. 
CC: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> CC: Ben Pfaff <blp@ovn.org> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Acked-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2017-11-22 15:11:45 +03:00
/* Set to true if the pmd thread needs to be reloaded. */
bool need_reload;
};
/* Interface to netdev-based datapath. */
struct dpif_netdev {
struct dpif dpif;
struct dp_netdev *dp;
uint64_t last_port_seq;
};
static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
struct dp_netdev_port **portp)
OVS_REQUIRES(dp->port_mutex);
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
struct dp_netdev_port **portp)
OVS_REQUIRES(dp->port_mutex);
static void dp_netdev_free(struct dp_netdev *)
OVS_REQUIRES(dp_netdev_mutex);
static int do_add_port(struct dp_netdev *dp, const char *devname,
const char *type, odp_port_t port_no)
OVS_REQUIRES(dp->port_mutex);
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
OVS_REQUIRES(dp->port_mutex);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
bool create, struct dpif **);
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *,
bool should_steal,
const struct flow *flow,
const struct nlattr *actions,
size_t actions_len);
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
struct dp_packet_batch *, odp_port_t port_no);
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
struct dp_packet_batch *);
static void dp_netdev_disable_upcall(struct dp_netdev *);
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev *dp, unsigned core_id,
int numa_id);
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex);
static void *pmd_thread_main(void *);
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
unsigned core_id);
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
static void dp_netdev_del_pmd(struct dp_netdev *dp,
struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_port *port)
OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
struct tx_port *tx)
OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_rxq *rxq)
OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
struct rxq_poll *poll)
OVS_REQUIRES(pmd->port_mutex);
static int
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
bool force);
static void reconfigure_datapath(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex);
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
OVS_REQUIRES(pmd->port_mutex);
static inline void
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
struct polled_queue *poll_list, int poll_cnt);
static void
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
enum rxq_cycles_counter_type type,
unsigned long long cycles);
static uint64_t
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
enum rxq_cycles_counter_type type);
static void
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
unsigned long long cycles);
static uint64_t
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
bool purge);
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
struct tx_port *tx);
static inline bool emc_entry_alive(struct emc_entry *ce);
static void emc_clear_entry(struct emc_entry *ce);
static void smc_clear_entry(struct smc_bucket *b, int idx);
static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
static inline bool
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_flow *flow);
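/* Initializes the exact match cache: every entry starts with no flow and an
 * empty miniflow key, and the sweep index is reset to the first entry. */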
static void
emc_cache_init(struct emc_cache *flow_cache)
{
int i;
flow_cache->sweep_idx = 0;
for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
flow_cache->entries[i].flow = NULL;
flow_cache->entries[i].key.hash = 0;
flow_cache->entries[i].key.len = sizeof(struct miniflow);
flowmap_init(&flow_cache->entries[i].key.mf.map);
}
}
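/* Initializes the signature match cache by filling every bucket slot with the
 * UINT16_MAX sentinel, i.e. no flow index assigned. */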
static void
smc_cache_init(struct smc_cache *smc_cache)
{
int i, j;
for (i = 0; i < SMC_BUCKET_CNT; i++) {
for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
}
}
}
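/* Initializes both levels of the datapath flow cache (EMC and SMC). */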
static void
dfc_cache_init(struct dfc_cache *flow_cache)
{
emc_cache_init(&flow_cache->emc_cache);
smc_cache_init(&flow_cache->smc_cache);
}
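/* Clears every EMC entry, releasing any flow references still held. */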
static void
emc_cache_uninit(struct emc_cache *flow_cache)
{
int i;
for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
emc_clear_entry(&flow_cache->entries[i]);
}
}
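/* Clears every slot in every SMC bucket. */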
static void
smc_cache_uninit(struct smc_cache *smc)
{
int i, j;
for (i = 0; i < SMC_BUCKET_CNT; i++) {
for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
smc_clear_entry(&(smc->buckets[i]), j);
}
}
}
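/* Tears down both levels of the datapath flow cache. */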
static void
dfc_cache_uninit(struct dfc_cache *flow_cache)
{
smc_cache_uninit(&flow_cache->smc_cache);
emc_cache_uninit(&flow_cache->emc_cache);
}
/* Check and clear dead flow references slowly (one entry at each
* invocation). */
static void
emc_cache_slow_sweep(struct emc_cache *flow_cache)
{
struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
if (!emc_entry_alive(entry)) {
emc_clear_entry(entry);
}
flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
}
/* Updates the time in the PMD thread's context and should be called in three cases:
*
* 1. PMD structure initialization:
* - dp_netdev_configure_pmd()
*
* 2. Before processing of the new packet batch:
* - dpif_netdev_execute()
* - dp_netdev_process_rxq_port()
*
* 3. At least once per polling iteration in main polling threads if no
*    packets were received in the current iteration:
* - dpif_netdev_run()
* - pmd_thread_main()
*
* 'pmd->ctx.now' should be used without update in all other cases if possible.
*/
static inline void
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
{
pmd->ctx.now = time_usec();
}
/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
bool
dpif_is_netdev(const struct dpif *dpif)
{
return dpif->dpif_class->open == dpif_netdev_open;
}
static struct dpif_netdev *
dpif_netdev_cast(const struct dpif *dpif)
{
ovs_assert(dpif_is_netdev(dpif));
return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
}
static struct dp_netdev *
get_dp_netdev(const struct dpif *dpif)
{
return dpif_netdev_cast(dpif)->dp;
}
enum pmd_info_type {
PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
};
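/* Appends a header line for 'pmd' ("main thread" or "pmd thread", plus its
 * numa and core ids when known) to 'reply'. */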
static void
format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
{
ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
? "main thread" : "pmd thread");
if (pmd->numa_id != OVS_NUMA_UNSPEC) {
ds_put_format(reply, " numa_id %d", pmd->numa_id);
}
if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
ds_put_format(reply, " core_id %u", pmd->core_id);
}
ds_put_cstr(reply, ":\n");
}
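/* Formats the accumulated performance counters of 'pmd' (packet, cache hit,
 * upcall and cycle statistics) into 'reply'. */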
static void
pmd_info_show_stats(struct ds *reply,
struct dp_netdev_pmd_thread *pmd)
{
uint64_t stats[PMD_N_STATS];
uint64_t total_cycles, total_packets;
double passes_per_pkt = 0;
double lookups_per_hit = 0;
double packets_per_batch = 0;
pmd_perf_read_counters(&pmd->perf_stats, stats);
total_cycles = stats[PMD_CYCLES_ITER_IDLE]
+ stats[PMD_CYCLES_ITER_BUSY];
total_packets = stats[PMD_STAT_RECV];
format_pmd_thread(reply, pmd);
if (total_packets > 0) {
passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
/ (double) total_packets;
}
if (stats[PMD_STAT_MASKED_HIT] > 0) {
lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
/ (double) stats[PMD_STAT_MASKED_HIT];
}
if (stats[PMD_STAT_SENT_BATCHES] > 0) {
packets_per_batch = stats[PMD_STAT_SENT_PKTS]
/ (double) stats[PMD_STAT_SENT_BATCHES];
}
ds_put_format(reply,
" packets received: %"PRIu64"\n"
" packet recirculations: %"PRIu64"\n"
" avg. datapath passes per packet: %.02f\n"
" emc hits: %"PRIu64"\n"
" smc hits: %"PRIu64"\n"
" megaflow hits: %"PRIu64"\n"
" avg. subtable lookups per megaflow hit: %.02f\n"
" miss with success upcall: %"PRIu64"\n"
" miss with failed upcall: %"PRIu64"\n"
" avg. packets per output batch: %.02f\n",
total_packets, stats[PMD_STAT_RECIRC],
passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
stats[PMD_STAT_SMC_HIT],
stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
packets_per_batch);
if (total_cycles == 0) {
return;
}
ds_put_format(reply,
" idle cycles: %"PRIu64" (%.02f%%)\n"
" processing cycles: %"PRIu64" (%.02f%%)\n",
stats[PMD_CYCLES_ITER_IDLE],
stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
stats[PMD_CYCLES_ITER_BUSY],
stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
if (total_packets == 0) {
return;
}
ds_put_format(reply,
" avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
total_cycles / (double) total_packets,
total_cycles, total_packets);
ds_put_format(reply,
" avg processing cycles per packet: "
"%.02f (%"PRIu64"/%"PRIu64")\n",
stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
stats[PMD_CYCLES_ITER_BUSY], total_packets);
}
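/* Formats the detailed performance metrics of 'pmd' (overall stats and, when
 * metrics collection is enabled, histograms and iteration/millisecond
 * histories) into 'reply'.  The main (non-pmd) thread is skipped. */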
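/* Formats the detailed performance statistics of one PMD thread into
 * 'reply'.  The non-PMD thread is skipped.  Prints the current time and
 * the measurement duration derived from perf_stats.start_ms, followed by
 * the overall counters and, when detailed metrics are enabled, the
 * requested histograms and iteration/millisecond histories.  clear_mutex
 * is held while formatting so that a concurrent stats clear cannot
 * interfere. */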
static void
pmd_info_show_perf(struct ds *reply,
struct dp_netdev_pmd_thread *pmd,
struct pmd_perf_params *par)
{
if (pmd->core_id != NON_PMD_CORE_ID) {
char *time_str =
xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
long long now = time_msec();
double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
ds_put_cstr(reply, "\n");
ds_put_format(reply, "Time: %s\n", time_str);
ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
ds_put_cstr(reply, "\n");
format_pmd_thread(reply, pmd);
ds_put_cstr(reply, "\n");
pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
if (pmd_perf_metrics_enabled(pmd)) {
/* Prevent parallel clearing of perf metrics. */
ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
if (par->histograms) {
ds_put_cstr(reply, "\n");
pmd_perf_format_histograms(reply, &pmd->perf_stats);
}
if (par->iter_hist_len > 0) {
ds_put_cstr(reply, "\n");
pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
par->iter_hist_len);
}
if (par->ms_hist_len > 0) {
ds_put_cstr(reply, "\n");
pmd_perf_format_ms_history(reply, &pmd->perf_stats,
par->ms_hist_len);
}
ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
}
free(time_str);
}
}
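/* qsort() comparator for rxq poll list entries: orders them by port name
 * and, within the same port, by queue id. */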
static int
compare_poll_list(const void *a_, const void *b_)
{
const struct rxq_poll *a = a_;
const struct rxq_poll *b = b_;
const char *namea = netdev_rxq_get_name(a->rxq->rx);
const char *nameb = netdev_rxq_get_name(b->rxq->rx);
int cmp = strcmp(namea, nameb);
if (!cmp) {
return netdev_rxq_get_queue_id(a->rxq->rx)
- netdev_rxq_get_queue_id(b->rxq->rx);
} else {
return cmp;
}
}
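/* Stores in '*list' a sorted copy of the poll list of 'pmd' with '*n'
 * entries, ordered by port name and queue id, or NULL if the poll list is
 * empty.  The caller is expected to hold pmd->port_mutex and must free the
 * returned array. */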
static void
sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
size_t *n)
{
struct rxq_poll *ret, *poll;
size_t i;
*n = hmap_count(&pmd->poll_list);
if (!*n) {
ret = NULL;
} else {
ret = xcalloc(*n, sizeof *ret);
i = 0;
HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
ret[i] = *poll;
i++;
}
ovs_assert(i == *n);
qsort(ret, *n, sizeof *ret, compare_poll_list);
}
*list = ret;
}
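/* Prints, for each rx queue polled by 'pmd', the estimated share of the
 * PMD's cycles spent processing that queue, based on the per-interval rxq
 * cycle counters accumulated over the last PMD_RXQ_INTERVAL_MAX intervals.
 * "NOT AVAIL" is printed when no interval cycle total is available yet. */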
static void
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
{
if (pmd->core_id != NON_PMD_CORE_ID) {
struct rxq_poll *list;
size_t n_rxq;
uint64_t total_cycles = 0;
ds_put_format(reply,
"pmd thread numa_id %d core_id %u:\n isolated : %s\n",
pmd->numa_id, pmd->core_id, (pmd->isolated)
? "true" : "false");
ovs_mutex_lock(&pmd->port_mutex);
sorted_poll_list(pmd, &list, &n_rxq);
/* Get the total pmd cycles for an interval. */
atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
/* Estimate the cycles to cover all intervals. */
total_cycles *= PMD_RXQ_INTERVAL_MAX;
for (int i = 0; i < n_rxq; i++) {
struct dp_netdev_rxq *rxq = list[i].rxq;
const char *name = netdev_rxq_get_name(rxq->rx);
uint64_t proc_cycles = 0;
for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
}
ds_put_format(reply, " port: %-16s queue-id: %2d", name,
netdev_rxq_get_queue_id(list[i].rxq->rx));
ds_put_format(reply, " pmd usage: ");
if (total_cycles) {
ds_put_format(reply, "%2"PRIu64"",
proc_cycles * 100 / total_cycles);
ds_put_cstr(reply, " %");
} else {
ds_put_format(reply, "%s", "NOT AVAIL");
}
ds_put_cstr(reply, "\n");
}
ovs_mutex_unlock(&pmd->port_mutex);
free(list);
}
}
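/* qsort() comparator that orders PMD threads by ascending core id. */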
static int
compare_poll_thread_list(const void *a_, const void *b_)
{
const struct dp_netdev_pmd_thread *a, *b;
a = *(struct dp_netdev_pmd_thread **)a_;
b = *(struct dp_netdev_pmd_thread **)b_;
if (a->core_id < b->core_id) {
return -1;
}
if (a->core_id > b->core_id) {
return 1;
}
return 0;
}
/* Create a sorted list of pmds from the dp->poll_threads cmap. We can use
* this list, as long as we do not go to quiescent state. */
static void
sorted_poll_thread_list(struct dp_netdev *dp,
struct dp_netdev_pmd_thread ***list,
size_t *n)
{
struct dp_netdev_pmd_thread *pmd;
struct dp_netdev_pmd_thread **pmd_list;
size_t k = 0, n_pmds;
n_pmds = cmap_count(&dp->poll_threads);
pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
if (k >= n_pmds) {
break;
}
pmd_list[k++] = pmd;
}
qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
*list = pmd_list;
*n = k;
}
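/* Handler for the "dpif-netdev/pmd-rxq-rebalance" unixctl command:
 *
 *     ovs-appctl dpif-netdev/pmd-rxq-rebalance [dp]
 *
 * Requests a datapath reconfiguration, which redistributes the rx queues
 * over the PMD threads.  The datapath name may be omitted when only one
 * datapath exists. */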
static void
dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
const char *argv[], void *aux OVS_UNUSED)
{
struct ds reply = DS_EMPTY_INITIALIZER;
struct dp_netdev *dp = NULL;
ovs_mutex_lock(&dp_netdev_mutex);
if (argc == 2) {
dp = shash_find_data(&dp_netdevs, argv[1]);
} else if (shash_count(&dp_netdevs) == 1) {
/* There's only one datapath */
dp = shash_first(&dp_netdevs)->data;
}
if (!dp) {
ovs_mutex_unlock(&dp_netdev_mutex);
unixctl_command_reply_error(conn,
"please specify an existing datapath");
return;
}
dp_netdev_request_reconfigure(dp);
ovs_mutex_unlock(&dp_netdev_mutex);
ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
unixctl_command_reply(conn, ds_cstr(&reply));
ds_destroy(&reply);
}
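/* Common handler for the "dpif-netdev/pmd-*" unixctl commands.  'aux'
 * selects the action: show stats, clear stats, show rx queues or show
 * detailed performance metrics.  Accepts an optional "-pmd <core>"
 * argument to restrict the output to a single PMD thread and an optional
 * datapath name; when no datapath is given and exactly one exists, that
 * one is used. */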
static void
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
void *aux)
{
struct ds reply = DS_EMPTY_INITIALIZER;
struct dp_netdev_pmd_thread **pmd_list;
struct dp_netdev *dp = NULL;
enum pmd_info_type type = *(enum pmd_info_type *) aux;
unsigned int core_id;
bool filter_on_pmd = false;
size_t n;
ovs_mutex_lock(&dp_netdev_mutex);
while (argc > 1) {
if (!strcmp(argv[1], "-pmd") && argc > 2) {
if (str_to_uint(argv[2], 10, &core_id)) {
filter_on_pmd = true;
}
argc -= 2;
argv += 2;
} else {
dp = shash_find_data(&dp_netdevs, argv[1]);
argc -= 1;
argv += 1;
}
}
if (!dp) {
if (shash_count(&dp_netdevs) == 1) {
/* There's only one datapath */
dp = shash_first(&dp_netdevs)->data;
} else {
ovs_mutex_unlock(&dp_netdev_mutex);
unixctl_command_reply_error(conn,
"please specify an existing datapath");
return;
}
}
sorted_poll_thread_list(dp, &pmd_list, &n);
for (size_t i = 0; i < n; i++) {
struct dp_netdev_pmd_thread *pmd = pmd_list[i];
if (!pmd) {
break;
}
if (filter_on_pmd && pmd->core_id != core_id) {
continue;
}
if (type == PMD_INFO_SHOW_RXQ) {
pmd_info_show_rxq(&reply, pmd);
} else if (type == PMD_INFO_CLEAR_STATS) {
pmd_perf_stats_clear(&pmd->perf_stats);
} else if (type == PMD_INFO_SHOW_STATS) {
pmd_info_show_stats(&reply, pmd);
} else if (type == PMD_INFO_PERF_SHOW) {
pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
}
}
free(pmd_list);
ovs_mutex_unlock(&dp_netdev_mutex);
unixctl_command_reply(conn, ds_cstr(&reply));
ds_destroy(&reply);
}
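/* Handler for the "dpif-netdev/pmd-perf-show" unixctl command:
 *
 *     ovs-appctl dpif-netdev/pmd-perf-show [-nh] [-it iter-history-len]
 *             [-ms ms-history-len] [-pmd core] [dp]
 *
 * Parses the "-nh", "-it" and "-ms" options, clamping the history lengths
 * to the range [0, HISTORY_LEN], and passes the remaining arguments to
 * dpif_netdev_pmd_info() with type PMD_INFO_PERF_SHOW. */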
static void
pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
const char *argv[],
void *aux OVS_UNUSED)
{
struct pmd_perf_params par;
long int it_hist = 0, ms_hist = 0;
par.histograms = true;
while (argc > 1) {
if (!strcmp(argv[1], "-nh")) {
par.histograms = false;
argc -= 1;
argv += 1;
} else if (!strcmp(argv[1], "-it") && argc > 2) {
it_hist = strtol(argv[2], NULL, 10);
if (it_hist < 0) {
it_hist = 0;
} else if (it_hist > HISTORY_LEN) {
it_hist = HISTORY_LEN;
}
argc -= 2;
argv += 2;
} else if (!strcmp(argv[1], "-ms") && argc > 2) {
ms_hist = strtol(argv[2], NULL, 10);
if (ms_hist < 0) {
ms_hist = 0;
} else if (ms_hist > HISTORY_LEN) {
ms_hist = HISTORY_LEN;
}
argc -= 2;
argv += 2;
} else {
break;
}
}
par.iter_hist_len = it_hist;
par.ms_hist_len = ms_hist;
par.command_type = PMD_INFO_PERF_SHOW;
dpif_netdev_pmd_info(conn, argc, argv, &par);
}
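/* Registers the dpif-netdev unixctl commands: pmd-stats-show,
 * pmd-stats-clear, pmd-rxq-show, pmd-perf-show, pmd-rxq-rebalance and
 * pmd-perf-log-set. */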
static int
dpif_netdev_init(void)
{
static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
clear_aux = PMD_INFO_CLEAR_STATS,
poll_aux = PMD_INFO_SHOW_RXQ;
unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
0, 3, dpif_netdev_pmd_info,
(void *)&show_aux);
unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
0, 3, dpif_netdev_pmd_info,
(void *)&clear_aux);
unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
0, 3, dpif_netdev_pmd_info,
(void *)&poll_aux);
unixctl_command_register("dpif-netdev/pmd-perf-show",
"[-nh] [-it iter-history-len]"
" [-ms ms-history-len]"
" [-pmd core] [dp]",
0, 8, pmd_perf_show_cmd,
NULL);
unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
0, 1, dpif_netdev_pmd_rebalance,
NULL);
unixctl_command_register("dpif-netdev/pmd-perf-log-set",
"on|off [-b before] [-a after] [-e|-ne] "
"[-us usec] [-q qlen]",
0, 10, pmd_perf_log_set_cmd,
NULL);
return 0;
}
static int
dpif_netdev_enumerate(struct sset *all_dps,
const struct dpif_class *dpif_class)
{
struct shash_node *node;
ovs_mutex_lock(&dp_netdev_mutex);
SHASH_FOR_EACH(node, &dp_netdevs) {
struct dp_netdev *dp = node->data;
if (dpif_class != dp->class) {
/* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
* If the class doesn't match, skip this dpif. */
continue;
}
sset_add(all_dps, node->name);
}
ovs_mutex_unlock(&dp_netdev_mutex);
return 0;
}
static bool
dpif_netdev_class_is_dummy(const struct dpif_class *class)
{
return class != &dpif_netdev_class;
}
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
return strcmp(type, "internal") ? type
: dpif_netdev_class_is_dummy(class) ? "dummy-internal"
: "tap";
}
static struct dpif *
create_dpif_netdev(struct dp_netdev *dp)
{
uint16_t netflow_id = hash_string(dp->name, 0);
struct dpif_netdev *dpif;
ovs_refcount_ref(&dp->ref_cnt);
dpif = xmalloc(sizeof *dpif);
dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
dpif->dp = dp;
dpif->last_port_seq = seq_read(dp->port_seq);
return &dpif->dpif;
}
/* Choose an unused, non-zero port number and return it on success.
* Return ODPP_NONE on failure. */
static odp_port_t
choose_port(struct dp_netdev *dp, const char *name)
OVS_REQUIRES(dp->port_mutex)
{
uint32_t port_no;
if (dp->class != &dpif_netdev_class) {
const char *p;
int start_no = 0;
/* If the port name begins with "br", start the number search at
* 100 to make writing tests easier. */
if (!strncmp(name, "br", 2)) {
start_no = 100;
}
/* If the port name contains a number, try to assign that port number.
* This can make writing unit tests easier because port numbers are
* predictable. */
for (p = name; *p != '\0'; p++) {
if (isdigit((unsigned char) *p)) {
port_no = start_no + strtol(p, NULL, 10);
if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
&& !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
return u32_to_odp(port_no);
}
break;
}
}
}
for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
return u32_to_odp(port_no);
}
}
return ODPP_NONE;
}
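/* Allocates and initializes a new dp_netdev named 'name' and registers it
 * in the global 'dp_netdevs' shash: sets up the port map, meter locks,
 * connection tracker and PMD thread map, creates the non-PMD thread entry
 * and adds the local internal port (ODPP_LOCAL).  Upcalls start out
 * disabled.  On success stores the new datapath in '*dpp'. */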
static int
create_dp_netdev(const char *name, const struct dpif_class *class,
struct dp_netdev **dpp)
OVS_REQUIRES(dp_netdev_mutex)
{
struct dp_netdev *dp;
int error;
dp = xzalloc(sizeof *dp);
shash_add(&dp_netdevs, name, dp);
*CONST_CAST(const struct dpif_class **, &dp->class) = class;
*CONST_CAST(const char **, &dp->name) = xstrdup(name);
ovs_refcount_init(&dp->ref_cnt);
atomic_flag_clear(&dp->destroyed);
ovs_mutex_init(&dp->port_mutex);
hmap_init(&dp->ports);
dp->port_seq = seq_create();
fat_rwlock_init(&dp->upcall_rwlock);
dp->reconfigure_seq = seq_create();
dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
for (int i = 0; i < N_METER_LOCKS; ++i) {
ovs_mutex_init_adaptive(&dp->meter_locks[i]);
}
/* Disable upcalls by default. */
dp_netdev_disable_upcall(dp);
dp->upcall_aux = NULL;
dp->upcall_cb = NULL;
conntrack_init(&dp->conntrack);
atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
cmap_init(&dp->poll_threads);
dp->pmd_rxq_assign_cyc = true;
ovs_mutex_init(&dp->tx_qid_pool_mutex);
/* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
ovs_mutex_init_recursive(&dp->non_pmd_mutex);
ovsthread_key_create(&dp->per_pmd_key, NULL);
ovs_mutex_lock(&dp->port_mutex);
/* non-PMD will be created before all other threads and will
* allocate static_tx_qid = 0. */
dp_netdev_set_nonpmd(dp);
error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
"internal"),
ODPP_LOCAL);
ovs_mutex_unlock(&dp->port_mutex);
if (error) {
dp_netdev_free(dp);
return error;
}
dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
*dpp = dp;
return 0;
}
static void
dp_netdev_request_reconfigure(struct dp_netdev *dp)
{
seq_change(dp->reconfigure_seq);
}
static bool
dp_netdev_is_reconf_required(struct dp_netdev *dp)
{
return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
}
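/* Opens the datapath 'name', creating it first if 'create' is true.
 * Returns ENODEV if the datapath does not exist and 'create' is false,
 * EEXIST if it already exists and 'create' is true, and EINVAL if an
 * existing datapath belongs to a different dpif class. */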
static int
dpif_netdev_open(const struct dpif_class *class, const char *name,
bool create, struct dpif **dpifp)
{
struct dp_netdev *dp;
int error;
ovs_mutex_lock(&dp_netdev_mutex);
dp = shash_find_data(&dp_netdevs, name);
if (!dp) {
error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
} else {
error = (dp->class != class ? EINVAL
: create ? EEXIST
: 0);
}
if (!error) {
*dpifp = create_dpif_netdev(dp);
dp->dpif = *dpifp;
}
ovs_mutex_unlock(&dp_netdev_mutex);
return error;
}
static void
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
OVS_NO_THREAD_SAFETY_ANALYSIS
{
/* Check that upcalls are disabled, i.e. that the rwlock is taken */
ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
/* Before freeing a lock we should release it */
fat_rwlock_unlock(&dp->upcall_rwlock);
fat_rwlock_destroy(&dp->upcall_rwlock);
}
static void
dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
{
if (dp->meters[meter_id]) {
free(dp->meters[meter_id]);
dp->meters[meter_id] = NULL;
}
}
/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
* through the 'dp_netdevs' shash while freeing 'dp'. */
static void
dp_netdev_free(struct dp_netdev *dp)
OVS_REQUIRES(dp_netdev_mutex)
{
struct dp_netdev_port *port, *next;
shash_find_and_delete(&dp_netdevs, dp->name);
ovs_mutex_lock(&dp->port_mutex);
HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
do_del_port(dp, port);
}
ovs_mutex_unlock(&dp->port_mutex);
dp_netdev_destroy_all_pmds(dp, true);
cmap_destroy(&dp->poll_threads);
ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
id_pool_destroy(dp->tx_qid_pool);
ovs_mutex_destroy(&dp->non_pmd_mutex);
ovsthread_key_delete(dp->per_pmd_key);
conntrack_destroy(&dp->conntrack);
seq_destroy(dp->reconfigure_seq);
seq_destroy(dp->port_seq);
hmap_destroy(&dp->ports);
ovs_mutex_destroy(&dp->port_mutex);
/* Upcalls must be disabled at this point */
dp_netdev_destroy_upcall_lock(dp);
int i;
for (i = 0; i < MAX_METERS; ++i) {
meter_lock(dp, i);
dp_delete_meter(dp, i);
meter_unlock(dp, i);
}
for (i = 0; i < N_METER_LOCKS; ++i) {
ovs_mutex_destroy(&dp->meter_locks[i]);
}
free(dp->pmd_cmask);
free(CONST_CAST(char *, dp->name));
free(dp);
}
static void
dp_netdev_unref(struct dp_netdev *dp)
{
if (dp) {
/* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
* get a new reference to 'dp' through the 'dp_netdevs' shash. */
ovs_mutex_lock(&dp_netdev_mutex);
if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
dp_netdev_free(dp);
}
ovs_mutex_unlock(&dp_netdev_mutex);
}
}
static void
dpif_netdev_close(struct dpif *dpif)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
dp_netdev_unref(dp);
free(dpif);
}
static int
dpif_netdev_destroy(struct dpif *dpif)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
if (!atomic_flag_test_and_set(&dp->destroyed)) {
if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
/* Can't happen: 'dpif' still owns a reference to 'dp'. */
OVS_NOT_REACHED();
}
}
return 0;
}
/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
* load/store semantics. While the increment is not atomic, the load and
* store operations are, making it impossible to read inconsistent values.
*
* This is used to update thread local stats counters. */
static void
non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
{
unsigned long long tmp;
atomic_read_relaxed(var, &tmp);
tmp += n;
atomic_store_relaxed(var, tmp);
}
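/* Fills 'stats' with datapath-wide totals by summing the per-PMD perf
 * counters: flow count, cache and megaflow hits, misses and lost packets.
 * Megaflow mask statistics are not tracked and are reported as unknown. */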
static int
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_pmd_thread *pmd;
uint64_t pmd_stats[PMD_N_STATS];
stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
stats->n_flows += cmap_count(&pmd->flow_table);
pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
stats->n_missed += pmd_stats[PMD_STAT_MISS];
stats->n_lost += pmd_stats[PMD_STAT_LOST];
}
stats->n_masks = UINT32_MAX;
stats->n_mask_hit = UINT64_MAX;
return 0;
}
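/* Requests that 'pmd' reload its cached port configuration.
 *
 * The non-PMD thread is handled synchronously here: its cached ports are
 * reloaded under 'non_pmd_mutex' and 'port_mutex'.  For an actual PMD
 * thread, 'reload_seq' is changed and 'reload' is set, and the caller then
 * waits on 'pmd->cond', which is presumably signalled by the PMD thread
 * once it has picked up the new configuration. */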
static void
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
{
if (pmd->core_id == NON_PMD_CORE_ID) {
ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
ovs_mutex_lock(&pmd->port_mutex);
pmd_load_cached_ports(pmd);
ovs_mutex_unlock(&pmd->port_mutex);
ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
return;
}
ovs_mutex_lock(&pmd->cond_mutex);
seq_change(pmd->reload_seq);
atomic_store_relaxed(&pmd->reload, true);
ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
ovs_mutex_unlock(&pmd->cond_mutex);
}
static uint32_t
hash_port_no(odp_port_t port_no)
{
return hash_int(odp_to_u32(port_no), 0);
}
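/* Creates a new 'struct dp_netdev_port' for 'devname' with the given 'type'
 * and 'port_no' and stores it in '*portp'.  Opens the netdev, rejects
 * loopback devices, and turns on promiscuous mode before allocating the
 * port.  Returns 0 on success or a positive errno value on failure, in
 * which case '*portp' is left NULL. */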
static int
port_create(const char *devname, const char *type,
odp_port_t port_no, struct dp_netdev_port **portp)
{
struct netdev_saved_flags *sf;
struct dp_netdev_port *port;
enum netdev_flags flags;
struct netdev *netdev;
int error;
*portp = NULL;
/* Open and validate network device. */
error = netdev_open(devname, type, &netdev);
if (error) {
return error;
}
/* XXX reject non-Ethernet devices */
netdev_get_flags(netdev, &flags);
if (flags & NETDEV_LOOPBACK) {
VLOG_ERR("%s: cannot add a loopback device", devname);
error = EINVAL;
goto out;
}
error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
if (error) {
VLOG_ERR("%s: cannot set promisc flag", devname);
goto out;
}
port = xzalloc(sizeof *port);
port->port_no = port_no;
port->netdev = netdev;
port->type = xstrdup(type);
port->sf = sf;
port->emc_enabled = true;
port->need_reconfigure = true;
ovs_mutex_init(&port->txq_used_mutex);
*portp = port;
return 0;
out:
netdev_close(netdev);
return error;
}
static int
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
odp_port_t port_no)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_port *port;
int error;
/* Reject devices already in 'dp'. */
if (!get_port_by_name(dp, devname, &port)) {
return EEXIST;
}
error = port_create(devname, type, port_no, &port);
if (error) {
return error;
}
hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
seq_change(dp->port_seq);
reconfigure_datapath(dp);
return 0;
}
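/* Adds 'netdev' to the datapath.  If '*port_nop' is not ODPP_NONE, the
 * caller-requested port number is used (EBUSY if it is already taken);
 * otherwise a free port number is chosen and returned in '*port_nop'
 * (EFBIG if none is available). */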
static int
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
odp_port_t *port_nop)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
const char *dpif_port;
odp_port_t port_no;
int error;
ovs_mutex_lock(&dp->port_mutex);
dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
if (*port_nop != ODPP_NONE) {
port_no = *port_nop;
error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
} else {
port_no = choose_port(dp, dpif_port);
error = port_no == ODPP_NONE ? EFBIG : 0;
}
if (!error) {
*port_nop = port_no;
error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
}
ovs_mutex_unlock(&dp->port_mutex);
return error;
}
static int
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
int error;
ovs_mutex_lock(&dp->port_mutex);
if (port_no == ODPP_LOCAL) {
error = EINVAL;
} else {
struct dp_netdev_port *port;
error = get_port_by_number(dp, port_no, &port);
if (!error) {
do_del_port(dp, port);
}
}
ovs_mutex_unlock(&dp->port_mutex);
return error;
}
static bool
is_valid_port_number(odp_port_t port_no)
{
return port_no != ODPP_NONE;
}
static struct dp_netdev_port *
dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_port *port;
HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
if (port->port_no == port_no) {
return port;
}
}
return NULL;
}
static int
get_port_by_number(struct dp_netdev *dp,
odp_port_t port_no, struct dp_netdev_port **portp)
OVS_REQUIRES(dp->port_mutex)
{
if (!is_valid_port_number(port_no)) {
*portp = NULL;
return EINVAL;
} else {
*portp = dp_netdev_lookup_port(dp, port_no);
return *portp ? 0 : ENODEV;
}
}
static void
port_destroy(struct dp_netdev_port *port)
{
if (!port) {
return;
}
netdev_close(port->netdev);
netdev_restore_flags(port->sf);
for (unsigned i = 0; i < port->n_rxq; i++) {
netdev_rxq_close(port->rxqs[i].rx);
}
ovs_mutex_destroy(&port->txq_used_mutex);
free(port->rxq_affinity_list);
free(port->txq_used);
free(port->rxqs);
free(port->type);
free(port);
}
static int
get_port_by_name(struct dp_netdev *dp,
const char *devname, struct dp_netdev_port **portp)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_port *port;
HMAP_FOR_EACH (port, node, &dp->ports) {
if (!strcmp(netdev_get_name(port->netdev), devname)) {
*portp = port;
return 0;
}
}
    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
     * non-existent port. */
return ENODEV;
}
/* Returns 'true' if there is a port with pmd netdev. */
static bool
has_pmd_port(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_port *port;
HMAP_FOR_EACH (port, node, &dp->ports) {
if (netdev_is_pmd(port->netdev)) {
return true;
}
}
return false;
}
static void
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
OVS_REQUIRES(dp->port_mutex)
{
hmap_remove(&dp->ports, &port->node);
seq_change(dp->port_seq);
reconfigure_datapath(dp);
port_destroy(port);
}
static void
answer_port_query(const struct dp_netdev_port *port,
struct dpif_port *dpif_port)
{
dpif_port->name = xstrdup(netdev_get_name(port->netdev));
dpif_port->type = xstrdup(port->type);
dpif_port->port_no = port->port_no;
}
static int
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
struct dpif_port *dpif_port)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_port *port;
int error;
ovs_mutex_lock(&dp->port_mutex);
error = get_port_by_number(dp, port_no, &port);
if (!error && dpif_port) {
answer_port_query(port, dpif_port);
}
ovs_mutex_unlock(&dp->port_mutex);
return error;
}
static int
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
struct dpif_port *dpif_port)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_port *port;
int error;
ovs_mutex_lock(&dp->port_mutex);
error = get_port_by_name(dp, devname, &port);
if (!error && dpif_port) {
answer_port_query(port, dpif_port);
}
ovs_mutex_unlock(&dp->port_mutex);
return error;
}
static void
dp_netdev_flow_free(struct dp_netdev_flow *flow)
{
dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
free(flow);
}
static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
{
if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
ovsrcu_postpone(dp_netdev_flow_free, flow);
}
}
static uint32_t
dp_netdev_flow_hash(const ovs_u128 *ufid)
{
return ufid->u32[0];
}
static inline struct dpcls *
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
odp_port_t in_port)
{
struct dpcls *cls;
uint32_t hash = hash_port_no(in_port);
CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
if (cls->in_port == in_port) {
/* Port classifier exists already */
return cls;
}
}
return NULL;
}
static inline struct dpcls *
dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
odp_port_t in_port)
OVS_REQUIRES(pmd->flow_mutex)
{
struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
uint32_t hash = hash_port_no(in_port);
if (!cls) {
/* Create new classifier for in_port */
cls = xmalloc(sizeof(*cls));
dpcls_init(cls);
cls->in_port = in_port;
cmap_insert(&pmd->classifiers, &cls->node, hash);
VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
}
return cls;
}
#define MAX_FLOW_MARK (UINT32_MAX - 1)
#define INVALID_FLOW_MARK (UINT32_MAX)
struct megaflow_to_mark_data {
const struct cmap_node node;
ovs_u128 mega_ufid;
uint32_t mark;
};
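/* Global bookkeeping for hardware offload flow marks.
 *
 * 'megaflow_to_mark' maps a mega ufid to its mark (1:1), while
 * 'mark_to_flow' maps a mark to the per-PMD dp_netdev_flow instances that
 * share it (1:N), e.g.:
 *
 *     mega_ufid A --> mark 7 --> { flow on PMD 1, flow on PMD 2, ... }
 *
 * 'pool' hands out unique mark ids; it is created lazily on the first
 * allocation. */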
struct flow_mark {
struct cmap megaflow_to_mark;
struct cmap mark_to_flow;
struct id_pool *pool;
};
static struct flow_mark flow_mark = {
.megaflow_to_mark = CMAP_INITIALIZER,
.mark_to_flow = CMAP_INITIALIZER,
};
static uint32_t
flow_mark_alloc(void)
{
uint32_t mark;
if (!flow_mark.pool) {
        /* Haven't initialized the pool yet; do it here. */
flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
}
if (id_pool_alloc_id(flow_mark.pool, &mark)) {
return mark;
}
return INVALID_FLOW_MARK;
}
static void
flow_mark_free(uint32_t mark)
{
id_pool_free_id(flow_mark.pool, mark);
}
/* associate megaflow with a mark, which is a 1:1 mapping */
static void
megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
{
size_t hash = dp_netdev_flow_hash(mega_ufid);
struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
data->mega_ufid = *mega_ufid;
data->mark = mark;
cmap_insert(&flow_mark.megaflow_to_mark,
CONST_CAST(struct cmap_node *, &data->node), hash);
}
/* Disassociate a megaflow from its mark. */
static void
megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
{
size_t hash = dp_netdev_flow_hash(mega_ufid);
struct megaflow_to_mark_data *data;
CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
cmap_remove(&flow_mark.megaflow_to_mark,
CONST_CAST(struct cmap_node *, &data->node), hash);
ovsrcu_postpone(free, data);
return;
}
}
VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
UUID_ARGS((struct uuid *)mega_ufid));
}
static inline uint32_t
megaflow_to_mark_find(const ovs_u128 *mega_ufid)
{
size_t hash = dp_netdev_flow_hash(mega_ufid);
struct megaflow_to_mark_data *data;
CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
return data->mark;
}
}
VLOG_WARN("Mark id for ufid "UUID_FMT" was not found\n",
UUID_ARGS((struct uuid *)mega_ufid));
return INVALID_FLOW_MARK;
}
/* associate mark with a flow, which is a 1:N mapping */
static void
mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
{
dp_netdev_flow_ref(flow);
cmap_insert(&flow_mark.mark_to_flow,
CONST_CAST(struct cmap_node *, &flow->mark_node),
hash_int(mark, 0));
flow->mark = mark;
VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
}
static bool
flow_mark_has_no_ref(uint32_t mark)
{
struct dp_netdev_flow *flow;
CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
&flow_mark.mark_to_flow) {
if (flow->mark == mark) {
return false;
}
}
return true;
}
static int
mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_flow *flow)
{
int ret = 0;
uint32_t mark = flow->mark;
struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
&flow->mark_node);
cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
flow->mark = INVALID_FLOW_MARK;
    /*
     * If no flow references the mark any more, remove the flow from
     * hardware and free the mark.
     */
if (flow_mark_has_no_ref(mark)) {
struct dp_netdev_port *port;
odp_port_t in_port = flow->flow.in_port.odp_port;
ovs_mutex_lock(&pmd->dp->port_mutex);
port = dp_netdev_lookup_port(pmd->dp, in_port);
if (port) {
ret = netdev_flow_del(port->netdev, &flow->mega_ufid, NULL);
}
ovs_mutex_unlock(&pmd->dp->port_mutex);
flow_mark_free(mark);
VLOG_DBG("Freed flow mark %u\n", mark);
megaflow_to_mark_disassociate(&flow->mega_ufid);
}
dp_netdev_flow_unref(flow);
return ret;
}
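/* Queues deletion of every offloaded flow that belongs to thread 'pmd',
 * so that the offload thread can remove the corresponding hardware flows
 * and release their marks. */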
static void
flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
{
struct dp_netdev_flow *flow;
CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
if (flow->pmd_id == pmd->core_id) {
queue_netdev_flow_del(pmd, flow);
}
}
}
static struct dp_netdev_flow *
mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
const uint32_t mark)
{
struct dp_netdev_flow *flow;
CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
&flow_mark.mark_to_flow) {
if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
flow->dead == false) {
return flow;
}
}
return NULL;
}
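/* Allocates a flow offload work item for 'flow' with operation 'op'.
 * Takes a reference on both 'flow' and 'pmd' so that they stay alive until
 * the offload thread has processed the item and calls
 * dp_netdev_free_flow_offload(). */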
static struct dp_flow_offload_item *
dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_flow *flow,
int op)
{
struct dp_flow_offload_item *offload;
offload = xzalloc(sizeof(*offload));
offload->pmd = pmd;
offload->flow = flow;
offload->op = op;
dp_netdev_flow_ref(flow);
dp_netdev_pmd_try_ref(pmd);
return offload;
}
static void
dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
{
dp_netdev_pmd_unref(offload->pmd);
dp_netdev_flow_unref(offload->flow);
free(offload->actions);
free(offload);
}
static void
dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
{
ovs_mutex_lock(&dp_flow_offload.mutex);
ovs_list_push_back(&dp_flow_offload.list, &offload->node);
xpthread_cond_signal(&dp_flow_offload.cond);
ovs_mutex_unlock(&dp_flow_offload.mutex);
}
static int
dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
{
return mark_to_flow_disassociate(offload->pmd, offload->flow);
}
/*
* There are two flow offload operations here: addition and modification.
*
* For flow addition, this function does:
* - allocate a new flow mark id
* - perform hardware flow offload
* - associate the flow mark with flow and mega flow
*
 * For flow modification, both the flow mark and the associations are still
 * valid, so only the hardware flow offload (step 2) is needed.
*/
static int
dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
{
struct dp_netdev_port *port;
struct dp_netdev_pmd_thread *pmd = offload->pmd;
struct dp_netdev_flow *flow = offload->flow;
odp_port_t in_port = flow->flow.in_port.odp_port;
bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
struct offload_info info;
uint32_t mark;
int ret;
if (flow->dead) {
return -1;
}
if (modification) {
mark = flow->mark;
ovs_assert(mark != INVALID_FLOW_MARK);
} else {
/*
* If a mega flow has already been offloaded (from other PMD
* instances), do not offload it again.
*/
mark = megaflow_to_mark_find(&flow->mega_ufid);
if (mark != INVALID_FLOW_MARK) {
VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
if (flow->mark != INVALID_FLOW_MARK) {
ovs_assert(flow->mark == mark);
} else {
mark_to_flow_associate(mark, flow);
}
return 0;
}
mark = flow_mark_alloc();
if (mark == INVALID_FLOW_MARK) {
VLOG_ERR("Failed to allocate flow mark!\n");
}
}
info.flow_mark = mark;
ovs_mutex_lock(&pmd->dp->port_mutex);
port = dp_netdev_lookup_port(pmd->dp, in_port);
if (!port) {
ovs_mutex_unlock(&pmd->dp->port_mutex);
return -1;
}
ret = netdev_flow_put(port->netdev, &offload->match,
CONST_CAST(struct nlattr *, offload->actions),
offload->actions_len, &flow->mega_ufid, &info,
NULL);
ovs_mutex_unlock(&pmd->dp->port_mutex);
if (ret) {
if (!modification) {
flow_mark_free(mark);
} else {
mark_to_flow_disassociate(pmd, flow);
}
return -1;
}
if (!modification) {
megaflow_to_mark_associate(&flow->mega_ufid, mark);
mark_to_flow_associate(mark, flow);
}
return 0;
}
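/* Main loop of the flow offload thread.
 *
 * The thread sleeps on 'dp_flow_offload.cond' (entering an RCU quiescent
 * state while it waits), pops the next work item off 'dp_flow_offload.list'
 * and dispatches it to dp_netdev_flow_offload_put() or
 * dp_netdev_flow_offload_del() depending on the operation.  Items are
 * queued by the PMD threads via dp_netdev_append_flow_offload(). */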
static void *
dp_netdev_flow_offload_main(void *data OVS_UNUSED)
{
struct dp_flow_offload_item *offload;
struct ovs_list *list;
const char *op;
int ret;
for (;;) {
ovs_mutex_lock(&dp_flow_offload.mutex);
if (ovs_list_is_empty(&dp_flow_offload.list)) {
ovsrcu_quiesce_start();
ovs_mutex_cond_wait(&dp_flow_offload.cond,
&dp_flow_offload.mutex);
ovsrcu_quiesce_end();
}
list = ovs_list_pop_front(&dp_flow_offload.list);
offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
ovs_mutex_unlock(&dp_flow_offload.mutex);
switch (offload->op) {
case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
op = "add";
ret = dp_netdev_flow_offload_put(offload);
break;
case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
op = "modify";
ret = dp_netdev_flow_offload_put(offload);
break;
case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
op = "delete";
ret = dp_netdev_flow_offload_del(offload);
break;
default:
OVS_NOT_REACHED();
}
VLOG_DBG("%s to %s netdev flow\n",
ret == 0 ? "succeed" : "failed", op);
dp_netdev_free_flow_offload(offload);
}
return NULL;
}
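/* Queues deletion of the hardware offload state for 'flow'.  The offload
 * thread is created lazily the first time an item is queued. */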
static void
queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_flow *flow)
{
struct dp_flow_offload_item *offload;
if (ovsthread_once_start(&offload_thread_once)) {
xpthread_cond_init(&dp_flow_offload.cond, NULL);
ovs_thread_create("dp_netdev_flow_offload",
dp_netdev_flow_offload_main, NULL);
ovsthread_once_done(&offload_thread_once);
}
offload = dp_netdev_alloc_flow_offload(pmd, flow,
DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
dp_netdev_append_flow_offload(offload);
}
static void
queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_flow *flow, struct match *match,
const struct nlattr *actions, size_t actions_len)
{
struct dp_flow_offload_item *offload;
int op;
if (!netdev_is_flow_api_enabled()) {
return;
}
if (ovsthread_once_start(&offload_thread_once)) {
xpthread_cond_init(&dp_flow_offload.cond, NULL);
ovs_thread_create("dp_netdev_flow_offload",
dp_netdev_flow_offload_main, NULL);
ovsthread_once_done(&offload_thread_once);
}
if (flow->mark != INVALID_FLOW_MARK) {
op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
} else {
op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
}
offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
offload->match = *match;
offload->actions = xmalloc(actions_len);
memcpy(offload->actions, actions, actions_len);
offload->actions_len = actions_len;
dp_netdev_append_flow_offload(offload);
}
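/* Removes 'flow' from 'pmd': the flow is deleted from its per-port
 * classifier and from the flow table, any hardware offload is queued for
 * deletion, and the flow itself is marked dead and unreferenced (the
 * actual free is deferred via RCU in dp_netdev_flow_unref()). */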
static void
dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_flow *flow)
OVS_REQUIRES(pmd->flow_mutex)
{
struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
struct dpcls *cls;
odp_port_t in_port = flow->flow.in_port.odp_port;
cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
ovs_assert(cls != NULL);
dpcls_remove(cls, &flow->cr);
cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
if (flow->mark != INVALID_FLOW_MARK) {
queue_netdev_flow_del(pmd, flow);
}
flow->dead = true;
dp_netdev_flow_unref(flow);
}
static void
dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
{
struct dp_netdev_flow *netdev_flow;
ovs_mutex_lock(&pmd->flow_mutex);
CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
dp_netdev_pmd_remove_flow(pmd, netdev_flow);
}
ovs_mutex_unlock(&pmd->flow_mutex);
}
static int
dpif_netdev_flow_flush(struct dpif *dpif)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_pmd_thread *pmd;
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
dp_netdev_pmd_flow_flush(pmd);
}
return 0;
}
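/* State carried across dpif_netdev_port_dump_start(), _next() and _done()
 * when iterating over the datapath's ports.  'name' holds a copy of the
 * most recently returned port name so that 'dpif_port->name' remains valid
 * until the next call. */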
struct dp_netdev_port_state {
struct hmap_position position;
char *name;
};
static int
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
{
*statep = xzalloc(sizeof(struct dp_netdev_port_state));
return 0;
}
static int
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
struct dpif_port *dpif_port)
{
struct dp_netdev_port_state *state = state_;
struct dp_netdev *dp = get_dp_netdev(dpif);
struct hmap_node *node;
int retval;
ovs_mutex_lock(&dp->port_mutex);
node = hmap_at_position(&dp->ports, &state->position);
if (node) {
struct dp_netdev_port *port;
port = CONTAINER_OF(node, struct dp_netdev_port, node);
free(state->name);
state->name = xstrdup(netdev_get_name(port->netdev));
dpif_port->name = state->name;
dpif_port->type = port->type;
dpif_port->port_no = port->port_no;
retval = 0;
} else {
retval = EOF;
}
ovs_mutex_unlock(&dp->port_mutex);
return retval;
}
static int
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
{
struct dp_netdev_port_state *state = state_;
free(state->name);
free(state);
return 0;
}
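/* Implements dpif 'port_poll': returns ENOBUFS if the set of ports has
 * changed since the last call (as tracked by 'port_seq'), EAGAIN otherwise. */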
static int
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
{
struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
uint64_t new_port_seq;
int error;
new_port_seq = seq_read(dpif->dp->port_seq);
if (dpif->last_port_seq != new_port_seq) {
dpif->last_port_seq = new_port_seq;
error = ENOBUFS;
} else {
error = EAGAIN;
}
return error;
}
static void
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
{
struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
}
static struct dp_netdev_flow *
dp_netdev_flow_cast(const struct dpcls_rule *cr)
{
return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
}
static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
{
return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
}
/* netdev_flow_key utilities.
*
* netdev_flow_key is basically a miniflow. We use these functions
* (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
* functions (miniflow_clone_inline, miniflow_equal, ...), because:
*
* - Since we are dealing exclusively with miniflows created by
* miniflow_extract(), if the map is different the miniflow is different.
* Therefore we can be faster by comparing the map and the miniflow in a
* single memcmp().
* - These functions can be inlined by the compiler. */
/* Given the number of bits set in miniflow's maps, returns the size of the
* 'netdev_flow_key.mf' */
static inline size_t
netdev_flow_key_size(size_t flow_u64s)
{
return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
}
static inline bool
netdev_flow_key_equal(const struct netdev_flow_key *a,
const struct netdev_flow_key *b)
{
/* 'b->len' may not be set yet. */
return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
}
/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
* The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
* generated by miniflow_extract. */
static inline bool
netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
const struct miniflow *mf)
{
return !memcmp(&key->mf, mf, key->len);
}
static inline void
netdev_flow_key_clone(struct netdev_flow_key *dst,
const struct netdev_flow_key *src)
{
memcpy(dst, src,
offsetof(struct netdev_flow_key, mf) + src->len);
}
/* Initialize a netdev_flow_key 'mask' from 'match'. */
static inline void
netdev_flow_mask_init(struct netdev_flow_key *mask,
const struct match *match)
{
uint64_t *dst = miniflow_values(&mask->mf);
struct flowmap fmap;
uint32_t hash = 0;
size_t idx;
/* Only check masks that make sense for the flow. */
flow_wc_map(&match->flow, &fmap);
flowmap_init(&mask->mf.map);
FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
if (mask_u64) {
flowmap_set(&mask->mf.map, idx, 1);
*dst++ = mask_u64;
hash = hash_add64(hash, mask_u64);
}
}
map_t map;
FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
hash = hash_add64(hash, map);
}
size_t n = dst - miniflow_get_values(&mask->mf);
mask->hash = hash_finish(hash, n * 8);
mask->len = netdev_flow_key_size(n);
}
/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
static inline void
netdev_flow_key_init_masked(struct netdev_flow_key *dst,
const struct flow *flow,
const struct netdev_flow_key *mask)
{
uint64_t *dst_u64 = miniflow_values(&dst->mf);
const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
uint32_t hash = 0;
uint64_t value;
dst->len = mask->len;
dst->mf = mask->mf; /* Copy maps. */
FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
*dst_u64 = value & *mask_u64++;
hash = hash_add64(hash, *dst_u64++);
}
dst->hash = hash_finish(hash,
(dst_u64 - miniflow_get_values(&dst->mf)) * 8);
}
/* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
/* Returns a hash value for the bits of 'key' where there are 1-bits in
* 'mask'. */
static inline uint32_t
netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
const struct netdev_flow_key *mask)
{
const uint64_t *p = miniflow_get_values(&mask->mf);
uint32_t hash = 0;
uint64_t value;
NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
hash = hash_add64(hash, value & *p++);
}
return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
}
static inline bool
emc_entry_alive(struct emc_entry *ce)
{
return ce->flow && !ce->flow->dead;
}
static void
emc_clear_entry(struct emc_entry *ce)
{
if (ce->flow) {
dp_netdev_flow_unref(ce->flow);
ce->flow = NULL;
}
}
static inline void
emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
const struct netdev_flow_key *key)
{
if (ce->flow != flow) {
if (ce->flow) {
dp_netdev_flow_unref(ce->flow);
}
if (dp_netdev_flow_ref(flow)) {
ce->flow = flow;
} else {
ce->flow = NULL;
}
}
if (key) {
netdev_flow_key_clone(&ce->key, key);
}
}
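/* Inserts 'flow' under 'key' into 'cache'. If an entry with the same key is
 * already present, only its flow pointer is refreshed; otherwise a victim
 * entry is chosen as described by the replacement policy below. */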
static inline void
emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
struct dp_netdev_flow *flow)
{
struct emc_entry *to_be_replaced = NULL;
struct emc_entry *current_entry;
EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
if (netdev_flow_key_equal(&current_entry->key, key)) {
/* We found the entry with the 'mf' miniflow */
emc_change_entry(current_entry, flow, NULL);
return;
}
/* Replacement policy: put the flow in an empty (not alive) entry, or,
 * failing that, in the entry whose key has the lowest hash. */
if (!to_be_replaced
|| (emc_entry_alive(to_be_replaced)
&& !emc_entry_alive(current_entry))
|| current_entry->key.hash < to_be_replaced->key.hash) {
to_be_replaced = current_entry;
}
}
/* We didn't find the miniflow in the cache.
* The 'to_be_replaced' entry is where the new flow will be stored */
emc_change_entry(to_be_replaced, flow, key);
}
static inline void
emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
const struct netdev_flow_key *key,
struct dp_netdev_flow *flow)
{
/* Insert an entry into the EMC based on probability value 'min'. By
 * default the value is UINT32_MAX / 100, which yields an insertion
 * probability of 1/100, i.e. 1%. */
uint32_t min = pmd->ctx.emc_insert_min;
if (min && random_uint32() <= min) {
emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
}
}
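/* Returns the alive flow stored in 'cache' under 'key', or NULL if the exact
 * match cache has no matching entry. */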
static inline struct dp_netdev_flow *
emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
{
struct emc_entry *current_entry;
EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
if (current_entry->key.hash == key->hash
&& emc_entry_alive(current_entry)
&& netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
/* We found the entry with the 'key->mf' miniflow */
return current_entry->flow;
}
}
return NULL;
}
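/* Signature match cache lookup: the low bits of 'hash' select a bucket and
 * the upper 16 bits form the signature. On a hit, returns the cmap node
 * stored in the flow table at the cached index, otherwise NULL. */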
static inline const struct cmap_node *
smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
{
struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
uint16_t sig = hash >> 16;
uint16_t index = UINT16_MAX;
for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
if (bucket->sig[i] == sig) {
index = bucket->flow_idx[i];
break;
}
}
if (index != UINT16_MAX) {
return cmap_find_by_index(&pmd->flow_table, index);
}
return NULL;
}
static void
smc_clear_entry(struct smc_bucket *b, int idx)
{
b->flow_idx[idx] = UINT16_MAX;
}
/* Inserts the flow_table index into the SMC. Insertion may fail when 1) the
 * SMC is turned off or 2) the flow_table index is larger than a uint16_t can
 * hold. If an SMC entry with the same signature already exists, its index is
 * updated. Otherwise an empty entry is taken if one is available; failing
 * that, a random entry in the hashed bucket is overwritten. */
static inline void
smc_insert(struct dp_netdev_pmd_thread *pmd,
const struct netdev_flow_key *key,
uint32_t hash)
{
struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
uint16_t index;
uint32_t cmap_index;
bool smc_enable_db;
int i;
atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
if (!smc_enable_db) {
return;
}
cmap_index = cmap_find_index(&pmd->flow_table, hash);
index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
/* If the index is larger than SMC can handle (uint16_t), we don't
* insert */
if (index == UINT16_MAX) {
return;
}
/* If an entry with the same signature already exists, update its index. */
uint16_t sig = key->hash >> 16;
for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
if (bucket->sig[i] == sig) {
bucket->flow_idx[i] = index;
return;
}
}
/* If there is an empty entry, occupy it. */
for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
if (bucket->flow_idx[i] == UINT16_MAX) {
bucket->sig[i] = sig;
bucket->flow_idx[i] = index;
return;
}
}
/* Otherwise, pick a random entry. */
i = random_uint32() % SMC_ENTRY_PER_BUCKET;
bucket->sig[i] = sig;
bucket->flow_idx[i] = index;
}
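/* Looks up 'key' in the megaflow classifier (dpcls) instance that 'pmd'
 * keeps for the packet's in_port. If 'lookup_num_p' is nonnull it is used
 * by dpcls_lookup() to count subtable lookups. Returns NULL on a miss. */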
static struct dp_netdev_flow *
dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
const struct netdev_flow_key *key,
int *lookup_num_p)
{
struct dpcls *cls;
struct dpcls_rule *rule;
odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
in_port.odp_port));
struct dp_netdev_flow *netdev_flow = NULL;
cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
if (OVS_LIKELY(cls)) {
dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
netdev_flow = dp_netdev_flow_cast(rule);
}
return netdev_flow;
}
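/* Finds the flow on 'pmd' with unique flow identifier 'ufidp'. If 'ufidp' is
 * null, a UFID is derived from the netlink-formatted 'key'. Returns NULL if
 * no matching flow exists. */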
static struct dp_netdev_flow *
dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
const ovs_u128 *ufidp, const struct nlattr *key,
size_t key_len)
{
struct dp_netdev_flow *netdev_flow;
struct flow flow;
ovs_u128 ufid;
/* If a UFID is not provided, determine one based on the key. */
if (!ufidp && key && key_len
&& !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
ufidp = &ufid;
}
if (ufidp) {
CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
&pmd->flow_table) {
if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
return netdev_flow;
}
}
}
return NULL;
}
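/* Copies the flow's packet and byte counters, last-used timestamp and TCP
 * flags into 'stats' using relaxed atomic reads. */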
static void
get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
struct dpif_flow_stats *stats)
{
struct dp_netdev_flow *netdev_flow;
unsigned long long n;
long long used;
uint16_t flags;
netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
stats->n_packets = n;
atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
stats->n_bytes = n;
atomic_read_relaxed(&netdev_flow->stats.used, &used);
stats->used = used;
atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
stats->tcp_flags = flags;
}
/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
* storing the netlink-formatted key/mask. 'key_buf' may be the same as
* 'mask_buf'. Actions will be returned without copying, by relying on RCU to
* protect them. */
static void
dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
struct dpif_flow *flow, bool terse)
{
if (terse) {
memset(flow, 0, sizeof *flow);
} else {
struct flow_wildcards wc;
struct dp_netdev_actions *actions;
size_t offset;
struct odp_flow_key_parms odp_parms = {
.flow = &netdev_flow->flow,
.mask = &wc.masks,
.support = dp_netdev_support,
};
miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
/* in_port is exact matched, but we have left it out of the mask for
 * optimization reasons. Add in_port back to the mask. */
wc.masks.in_port.odp_port = ODPP_NONE;
/* Key */
offset = key_buf->size;
flow->key = ofpbuf_tail(key_buf);
odp_flow_key_from_flow(&odp_parms, key_buf);
flow->key_len = key_buf->size - offset;
/* Mask */
offset = mask_buf->size;
flow->mask = ofpbuf_tail(mask_buf);
odp_parms.key_buf = key_buf;
odp_flow_key_from_mask(&odp_parms, mask_buf);
flow->mask_len = mask_buf->size - offset;
/* Actions */
actions = dp_netdev_flow_get_actions(netdev_flow);
flow->actions = actions->actions;
flow->actions_len = actions->size;
}
flow->ufid = netdev_flow->ufid;
flow->ufid_present = true;
flow->pmd_id = netdev_flow->pmd_id;
get_dpif_flow_stats(netdev_flow, &flow->stats);
flow->attrs.offloaded = false;
flow->attrs.dp_layer = "ovs";
}
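/* Parses the netlink mask attributes in 'mask_key' into 'wc', using the
 * already-parsed 'flow' for context. Returns EINVAL (logging the problem
 * unless 'probe' is set) if the mask does not fit. */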
static int
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
const struct nlattr *mask_key,
uint32_t mask_key_len, const struct flow *flow,
struct flow_wildcards *wc, bool probe)
{
enum odp_key_fitness fitness;
fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
if (fitness) {
if (!probe) {
/* This should not happen: it indicates that
* odp_flow_key_from_mask() and odp_flow_key_to_mask()
* disagree on the acceptable form of a mask. Log the problem
* as an error, with enough details to enable debugging. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
if (!VLOG_DROP_ERR(&rl)) {
struct ds s;
ds_init(&s);
odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
true);
VLOG_ERR("internal error parsing flow mask %s (%s)",
ds_cstr(&s), odp_key_fitness_to_string(fitness));
ds_destroy(&s);
}
}
return EINVAL;
}
return 0;
}
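/* Parses the netlink key attributes in 'key' into 'flow'. Returns EINVAL on
 * a malformed key or when the key carries connection-tracking state bits
 * that the userspace datapath does not support. */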
static int
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
struct flow *flow, bool probe)
{
if (odp_flow_key_to_flow(key, key_len, flow)) {
if (!probe) {
/* This should not happen: it indicates that
* odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
* the acceptable form of a flow. Log the problem as an error,
* with enough details to enable debugging. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
if (!VLOG_DROP_ERR(&rl)) {
struct ds s;
ds_init(&s);
odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
ds_destroy(&s);
}
}
return EINVAL;
}
if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
return EINVAL;
}
return 0;
}
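/* Implements dpif 'flow_get': searches the requested PMD thread, or every
 * PMD thread when 'get->pmd_id' is PMD_ID_NULL, and converts the first
 * matching flow to dpif_flow format. */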
static int
dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_flow *netdev_flow;
struct dp_netdev_pmd_thread *pmd;
struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
struct hmapx_node *node;
int error = EINVAL;
if (get->pmd_id == PMD_ID_NULL) {
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
dp_netdev_pmd_unref(pmd);
}
}
} else {
pmd = dp_netdev_get_pmd(dp, get->pmd_id);
if (!pmd) {
goto out;
}
hmapx_add(&to_find, pmd);
}
if (!hmapx_count(&to_find)) {
goto out;
}
HMAPX_FOR_EACH (node, &to_find) {
pmd = (struct dp_netdev_pmd_thread *) node->data;
netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
get->key_len);
if (netdev_flow) {
dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
get->flow, false);
error = 0;
break;
} else {
error = ENOENT;
}
}
HMAPX_FOR_EACH (node, &to_find) {
pmd = (struct dp_netdev_pmd_thread *) node->data;
dp_netdev_pmd_unref(pmd);
}
out:
hmapx_destroy(&to_find);
return error;
}
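/* Computes the "mega ufid" of 'match': a hash of its flow masked by its
 * wildcards. All per-PMD instances of the same megaflow share this
 * identifier. */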
static void
dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
{
struct flow masked_flow;
size_t i;
for (i = 0; i < sizeof(struct flow); i++) {
((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
((uint8_t *)&match->wc)[i];
}
dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), mega_ufid);
}
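/* Adds a flow matching 'match' with the given 'ufid' and 'actions' to 'pmd'.
 * The caller must hold 'pmd->flow_mutex'. */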
static struct dp_netdev_flow *
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
struct match *match, const ovs_u128 *ufid,
const struct nlattr *actions, size_t actions_len)
OVS_REQUIRES(pmd->flow_mutex)
{
struct dp_netdev_flow *flow;
struct netdev_flow_key mask;
struct dpcls *cls;
/* Make sure in_port is exact matched before we read it. */
ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
odp_port_t in_port = match->flow.in_port.odp_port;
/* As we select the dpcls based on the port number, each netdev flow
* belonging to the same dpcls will have the same odp_port value.
* For performance reasons we wildcard odp_port here in the mask. In the
* typical case dp_hash is also wildcarded, and the resulting 8-byte
* chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
* will not be part of the subtable mask.
* This will speed up the hash computation during dpcls_lookup() because
* there is one less call to hash_add64() in this case. */
match->wc.masks.in_port.odp_port = 0;
netdev_flow_mask_init(&mask, match);
match->wc.masks.in_port.odp_port = ODPP_NONE;
/* Make sure wc does not have metadata. */
ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
&& !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
/* Do not allocate extra space. */
flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
memset(&flow->stats, 0, sizeof flow->stats);
flow->dead = false;
flow->batch = NULL;
flow->mark = INVALID_FLOW_MARK;
*CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
*CONST_CAST(struct flow *, &flow->flow) = match->flow;
*CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
ovs_refcount_init(&flow->ref_cnt);
ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
/* Select dpcls for in_port. Relies on in_port to be exact match. */
cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
dpcls_insert(cls, &flow->cr, &mask);
cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
dp_netdev_flow_hash(&flow->ufid));
queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
struct ds ds = DS_EMPTY_INITIALIZER;
struct ofpbuf key_buf, mask_buf;
struct odp_flow_key_parms odp_parms = {
.flow = &match->flow,
.mask = &match->wc.masks,
.support = dp_netdev_support,
};
ofpbuf_init(&key_buf, 0);
ofpbuf_init(&mask_buf, 0);
odp_flow_key_from_flow(&odp_parms, &key_buf);
odp_parms.key_buf = &key_buf;
odp_flow_key_from_mask(&odp_parms, &mask_buf);
ds_put_cstr(&ds, "flow_add: ");
odp_format_ufid(ufid, &ds);
ds_put_cstr(&ds, " ");
odp_flow_format(key_buf.data, key_buf.size,
mask_buf.data, mask_buf.size,
NULL, &ds, false);
ds_put_cstr(&ds, ", actions:");
format_odp_actions(&ds, actions, actions_len, NULL);
VLOG_DBG("%s", ds_cstr(&ds));
ofpbuf_uninit(&key_buf);
ofpbuf_uninit(&mask_buf);
/* Add a printout of the actual match installed. */
struct match m;
ds_clear(&ds);
ds_put_cstr(&ds, "flow match: ");
miniflow_expand(&flow->cr.flow.mf, &m.flow);
miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
memset(&m.tun_md, 0, sizeof m.tun_md);
match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
VLOG_DBG("%s", ds_cstr(&ds));
ds_destroy(&ds);
}
return flow;
}
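/* Applies the flow put operation described by 'put' to a single PMD
 * thread.  With DPIF_FP_CREATE a new flow is installed unless the PMD's
 * flow table already holds MAX_FLOWS entries (EFBIG); with DPIF_FP_MODIFY
 * the actions of an existing flow are replaced under 'flow_mutex'.
 * Statistics for an existing flow are returned in 'stats' if requested. */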
static int
flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
struct netdev_flow_key *key,
struct match *match,
ovs_u128 *ufid,
const struct dpif_flow_put *put,
struct dpif_flow_stats *stats)
{
struct dp_netdev_flow *netdev_flow;
int error = 0;
if (stats) {
memset(stats, 0, sizeof *stats);
}
ovs_mutex_lock(&pmd->flow_mutex);
netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
if (!netdev_flow) {
if (put->flags & DPIF_FP_CREATE) {
if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
dp_netdev_flow_add(pmd, match, ufid, put->actions,
put->actions_len);
error = 0;
} else {
error = EFBIG;
}
} else {
error = ENOENT;
}
} else {
if (put->flags & DPIF_FP_MODIFY) {
struct dp_netdev_actions *new_actions;
struct dp_netdev_actions *old_actions;
new_actions = dp_netdev_actions_create(put->actions,
put->actions_len);
old_actions = dp_netdev_flow_get_actions(netdev_flow);
ovsrcu_set(&netdev_flow->actions, new_actions);
queue_netdev_flow_put(pmd, netdev_flow, match,
put->actions, put->actions_len);
if (stats) {
get_dpif_flow_stats(netdev_flow, stats);
}
if (put->flags & DPIF_FP_ZERO_STATS) {
/* XXX: The userspace datapath uses thread local statistics
* (for flows), which should be updated only by the owning
* thread. Since we cannot write on stats memory here,
* we choose not to support this flag. Please note:
* - This feature is currently used only by dpctl commands with
* option --clear.
* - Should the need arise, this operation can be implemented
* by keeping a base value (to be updated here) for each
* counter, and subtracting it before outputting the stats */
error = EOPNOTSUPP;
}
ovsrcu_postpone(dp_netdev_actions_free, old_actions);
} else if (put->flags & DPIF_FP_CREATE) {
error = EEXIST;
} else {
/* Overlapping flow. */
error = EINVAL;
}
}
ovs_mutex_unlock(&pmd->flow_mutex);
return error;
}
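/* dpif 'flow_put' implementation for the userspace datapath.  Translates
 * the Netlink key and mask from 'put' into a struct match, derives a ufid
 * if none was supplied, builds the netdev_flow_key used for lookup, and
 * then applies the operation either to the PMD named by 'put->pmd_id' or,
 * for PMD_ID_NULL, to every PMD thread while aggregating the returned
 * statistics. */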
static int
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct netdev_flow_key key, mask;
struct dp_netdev_pmd_thread *pmd;
struct match match;
ovs_u128 ufid;
int error;
bool probe = put->flags & DPIF_FP_PROBE;
if (put->stats) {
memset(put->stats, 0, sizeof *put->stats);
}
error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
probe);
if (error) {
return error;
}
error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
put->mask, put->mask_len,
&match.flow, &match.wc, probe);
if (error) {
return error;
}
if (put->ufid) {
ufid = *put->ufid;
} else {
dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
}
/* The Netlink encoding of datapath flow keys cannot express
* wildcarding the presence of a VLAN tag. Instead, a missing VLAN
* tag is interpreted as exact match on the fact that there is no
* VLAN. Unless we refactor a lot of code that translates between
* Netlink and struct flow representations, we have to do the same
* here. This must be in sync with 'match' in handle_packet_upcall(). */
if (!match.wc.masks.vlans[0].tci) {
match.wc.masks.vlans[0].tci = htons(0xffff);
}
/* Must produce a netdev_flow_key for lookup.
* Use the same method as employed to create the key when adding
* the flow to the dpcls to make sure they match. */
netdev_flow_mask_init(&mask, &match);
netdev_flow_key_init_masked(&key, &match.flow, &mask);
if (put->pmd_id == PMD_ID_NULL) {
if (cmap_count(&dp->poll_threads) == 0) {
return EINVAL;
}
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
struct dpif_flow_stats pmd_stats;
int pmd_error;
pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
&pmd_stats);
if (pmd_error) {
error = pmd_error;
} else if (put->stats) {
put->stats->n_packets += pmd_stats.n_packets;
put->stats->n_bytes += pmd_stats.n_bytes;
put->stats->used = MAX(put->stats->used, pmd_stats.used);
put->stats->tcp_flags |= pmd_stats.tcp_flags;
}
}
} else {
pmd = dp_netdev_get_pmd(dp, put->pmd_id);
if (!pmd) {
return EINVAL;
}
error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
dp_netdev_pmd_unref(pmd);
}
return error;
}
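/* Removes the flow described by 'del' from a single PMD thread.  Returns
 * the flow's final statistics in 'stats' if requested, or ENOENT if the
 * flow is not installed on this PMD. */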
static int
flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
struct dpif_flow_stats *stats,
const struct dpif_flow_del *del)
{
struct dp_netdev_flow *netdev_flow;
int error = 0;
ovs_mutex_lock(&pmd->flow_mutex);
netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
del->key_len);
if (netdev_flow) {
if (stats) {
get_dpif_flow_stats(netdev_flow, stats);
}
dp_netdev_pmd_remove_flow(pmd, netdev_flow);
} else {
error = ENOENT;
}
ovs_mutex_unlock(&pmd->flow_mutex);
return error;
}
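/* dpif 'flow_del' implementation.  Deletes the flow either from the PMD
 * named by 'del->pmd_id' or, for PMD_ID_NULL, from every PMD thread,
 * summing the per-PMD statistics into 'del->stats'. */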
static int
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_pmd_thread *pmd;
int error = 0;
if (del->stats) {
memset(del->stats, 0, sizeof *del->stats);
}
if (del->pmd_id == PMD_ID_NULL) {
if (cmap_count(&dp->poll_threads) == 0) {
return EINVAL;
}
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
struct dpif_flow_stats pmd_stats;
int pmd_error;
pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
if (pmd_error) {
error = pmd_error;
} else if (del->stats) {
del->stats->n_packets += pmd_stats.n_packets;
del->stats->n_bytes += pmd_stats.n_bytes;
del->stats->used = MAX(del->stats->used, pmd_stats.used);
del->stats->tcp_flags |= pmd_stats.tcp_flags;
}
}
} else {
pmd = dp_netdev_get_pmd(dp, del->pmd_id);
if (!pmd) {
return EINVAL;
}
error = flow_del_on_pmd(pmd, del->stats, del);
dp_netdev_pmd_unref(pmd);
}
return error;
}
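/* State shared by all threads taking part in a flow dump.  The cmap
 * positions track progress through the set of PMD threads and through the
 * current PMD's flow table; 'mutex' serializes access by dumping threads. */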
struct dpif_netdev_flow_dump {
struct dpif_flow_dump up;
struct cmap_position poll_thread_pos;
struct cmap_position flow_pos;
struct dp_netdev_pmd_thread *cur_pmd;
int status;
struct ovs_mutex mutex;
};
static struct dpif_netdev_flow_dump *
dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
{
return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
}
static struct dpif_flow_dump *
dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
struct dpif_flow_dump_types *types OVS_UNUSED)
{
struct dpif_netdev_flow_dump *dump;
dump = xzalloc(sizeof *dump);
dpif_flow_dump_init(&dump->up, dpif_);
dump->up.terse = terse;
ovs_mutex_init(&dump->mutex);
return &dump->up;
}
static int
dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
{
struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
ovs_mutex_destroy(&dump->mutex);
free(dump);
return 0;
}
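/* Per-thread dump state: a back-pointer to the shared dump plus scratch
 * buffers used to format each dumped flow's key and mask. */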
struct dpif_netdev_flow_dump_thread {
struct dpif_flow_dump_thread up;
struct dpif_netdev_flow_dump *dump;
struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
};
static struct dpif_netdev_flow_dump_thread *
dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
{
return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
}
static struct dpif_flow_dump_thread *
dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
{
struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
struct dpif_netdev_flow_dump_thread *thread;
thread = xmalloc(sizeof *thread);
dpif_flow_dump_thread_init(&thread->up, &dump->up);
thread->dump = dump;
return &thread->up;
}
static void
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
{
struct dpif_netdev_flow_dump_thread *thread
= dpif_netdev_flow_dump_thread_cast(thread_);
free(thread);
}
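/* Fills 'flows' with up to 'max_flows' flows (capped at
 * FLOW_DUMP_MAX_BATCH), advancing to the next PMD thread whenever the
 * current one has been exhausted.  Returns the number of flows dumped,
 * or 0 once every PMD thread has been visited. */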
static int
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
struct dpif_flow *flows, int max_flows)
{
struct dpif_netdev_flow_dump_thread *thread
= dpif_netdev_flow_dump_thread_cast(thread_);
struct dpif_netdev_flow_dump *dump = thread->dump;
struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
int n_flows = 0;
int i;
ovs_mutex_lock(&dump->mutex);
if (!dump->status) {
struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
/* On the first call to dump_next(), extracts the first pmd thread.
* If there is no pmd thread, returns immediately. */
if (!pmd) {
pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
if (!pmd) {
ovs_mutex_unlock(&dump->mutex);
return n_flows;
}
}
do {
for (n_flows = 0; n_flows < flow_limit; n_flows++) {
struct cmap_node *node;
node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
if (!node) {
break;
}
netdev_flows[n_flows] = CONTAINER_OF(node,
struct dp_netdev_flow,
node);
}
/* When finished dumping the current pmd thread, moves on to
* the next. */
if (n_flows < flow_limit) {
memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
dp_netdev_pmd_unref(pmd);
pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
if (!pmd) {
dump->status = EOF;
break;
}
}
/* Keeps the reference for the next caller. */
dump->cur_pmd = pmd;
/* If the current dump is empty, do not exit the loop, since the
* remaining pmds could have flows to be dumped. Just dumps again
* on the new 'pmd'. */
} while (!n_flows);
}
ovs_mutex_unlock(&dump->mutex);
for (i = 0; i < n_flows; i++) {
struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
struct odputil_keybuf *keybuf = &thread->keybuf[i];
struct dp_netdev_flow *netdev_flow = netdev_flows[i];
struct dpif_flow *f = &flows[i];
struct ofpbuf key, mask;
ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
dump->up.terse);
}
return n_flows;
}
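/* dpif 'execute' implementation: runs 'execute->actions' on a single
 * packet.  Callers that are not PMD threads run in the NON_PMD_CORE_ID
 * context and are serialized by 'dp->non_pmd_mutex'. */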
static int
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
OVS_NO_THREAD_SAFETY_ANALYSIS
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_pmd_thread *pmd;
struct dp_packet_batch pp;
if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
dp_packet_size(execute->packet) > UINT16_MAX) {
return EINVAL;
}
/* Tries finding the 'pmd'. If NULL is returned, that means
* the current thread is a non-pmd thread and should use
* dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
pmd = ovsthread_getspecific(dp->per_pmd_key);
if (!pmd) {
pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
if (!pmd) {
return EBUSY;
}
}
if (execute->probe) {
/* If this is part of a probe, drop the packet, since executing
* the action may actually cause spurious packets to be sent into
* the network. */
if (pmd->core_id == NON_PMD_CORE_ID) {
dp_netdev_pmd_unref(pmd);
}
return 0;
}
/* If the current thread is a non-pmd thread, acquires
* the 'non_pmd_mutex'. */
if (pmd->core_id == NON_PMD_CORE_ID) {
ovs_mutex_lock(&dp->non_pmd_mutex);
}
/* Update current time in PMD context. We don't care about EMC insertion
* probability, because we are on a slow path. */
pmd_thread_ctx_time_update(pmd);
/* The action processing expects the RSS hash to be valid, because
* it's always initialized at the beginning of datapath processing.
* In this case, though, 'execute->packet' may not have gone through
* the datapath at all, it may have been generated by the upper layer
* (OpenFlow packet-out, BFD frame, ...). */
if (!dp_packet_rss_valid(execute->packet)) {
dp_packet_set_rss_hash(execute->packet,
flow_hash_5tuple(execute->flow, 0));
}
dp_packet_batch_init_packet(&pp, execute->packet);
dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
execute->actions, execute->actions_len);
dp_netdev_pmd_flush_output_packets(pmd, true);
if (pmd->core_id == NON_PMD_CORE_ID) {
ovs_mutex_unlock(&dp->non_pmd_mutex);
dp_netdev_pmd_unref(pmd);
}
return 0;
}
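/* dpif 'operate' implementation: dispatches each operation in 'ops' to the
 * matching single-request handler above and stores its error code. */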
static void
dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
enum dpif_offload_type offload_type OVS_UNUSED)
{
size_t i;
for (i = 0; i < n_ops; i++) {
struct dpif_op *op = ops[i];
switch (op->type) {
case DPIF_OP_FLOW_PUT:
op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
break;
case DPIF_OP_FLOW_DEL:
op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
break;
case DPIF_OP_EXECUTE:
op->error = dpif_netdev_execute(dpif, &op->execute);
break;
case DPIF_OP_FLOW_GET:
op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
break;
}
}
}
/* Enable or disable PMD auto load balancing. */
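/* Auto load balancing is activated only when it has been requested via
 * other_config, Rxq-to-PMD assignment is cycle based, and there are at
 * least two non-isolated PMDs with at least one of them polling more than
 * one Rx queue; otherwise it is turned off. */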
static void
set_pmd_auto_lb(struct dp_netdev *dp)
{
unsigned int cnt = 0;
struct dp_netdev_pmd_thread *pmd;
struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
bool enable_alb = false;
bool multi_rxq = false;
bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
/* Ensure that there are at least 2 non-isolated PMDs and
* that one of them is polling more than one rxq. */
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
continue;
}
if (hmap_count(&pmd->poll_list) > 1) {
multi_rxq = true;
}
if (cnt && multi_rxq) {
enable_alb = true;
break;
}
cnt++;
}
/* Enable auto LB only if it is requested and cycle-based assignment is in use. */
enable_alb = enable_alb && pmd_rxq_assign_cyc &&
pmd_alb->auto_lb_requested;
if (pmd_alb->is_enabled != enable_alb) {
pmd_alb->is_enabled = enable_alb;
if (pmd_alb->is_enabled) {
VLOG_INFO("PMD auto load balance is enabled "
"(with rebalance interval:%"PRIu64" msec)",
pmd_alb->rebalance_intvl);
} else {
pmd_alb->rebalance_poll_timer = 0;
VLOG_INFO("PMD auto load balance is disabled");
}
}
}
/* Applies datapath configuration from the database. Some of the changes are
* actually applied in dpif_netdev_run(). */
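/* The keys handled here include "pmd-cpu-mask", "tx-flush-interval",
 * "emc-insert-inv-prob", "pmd-perf-metrics", "smc-enable",
 * "pmd-rxq-assign", "pmd-auto-lb" and "pmd-auto-lb-rebal-interval",
 * typically set with, e.g.:
 *   ovs-vsctl set Open_vSwitch . other_config:pmd-auto-lb=true */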
static int
dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
const char *cmask = smap_get(other_config, "pmd-cpu-mask");
const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
"cycles");
unsigned long long insert_prob =
smap_get_ullong(other_config, "emc-insert-inv-prob",
DEFAULT_EM_FLOW_INSERT_INV_PROB);
uint32_t insert_min, cur_min;
uint32_t tx_flush_interval, cur_tx_flush_interval;
uint64_t rebalance_intvl;
tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
DEFAULT_TX_FLUSH_INTERVAL);
atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
if (tx_flush_interval != cur_tx_flush_interval) {
atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
tx_flush_interval);
}
if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
free(dp->pmd_cmask);
dp->pmd_cmask = nullable_xstrdup(cmask);
dp_netdev_request_reconfigure(dp);
}
atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
if (insert_prob <= UINT32_MAX) {
insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
} else {
insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
}
if (insert_min != cur_min) {
atomic_store_relaxed(&dp->emc_insert_min, insert_min);
if (insert_min == 0) {
VLOG_INFO("EMC insertion probability changed to zero");
} else {
VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
insert_prob, (100 / (float)insert_prob));
}
}
bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
bool cur_perf_enabled;
atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
if (perf_enabled != cur_perf_enabled) {
atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
if (perf_enabled) {
VLOG_INFO("PMD performance metrics collection enabled");
} else {
VLOG_INFO("PMD performance metrics collection disabled");
}
}
bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
bool cur_smc;
atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
if (smc_enable != cur_smc) {
atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
if (smc_enable) {
VLOG_INFO("SMC cache is enabled");
} else {
VLOG_INFO("SMC cache is disabled");
}
}
bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
"Defaulting to 'cycles'.");
pmd_rxq_assign_cyc = true;
pmd_rxq_assign = "cycles";
}
if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
pmd_rxq_assign);
dp_netdev_request_reconfigure(dp);
}
struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
false);
rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
ALB_PMD_REBALANCE_POLL_INTERVAL);
/* Input is in min, convert it to msec. */
rebalance_intvl =
rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
if (pmd_alb->rebalance_intvl != rebalance_intvl) {
pmd_alb->rebalance_intvl = rebalance_intvl;
}
set_pmd_auto_lb(dp);
return 0;
}
/* Parses affinity list and returns result in 'core_ids'. */
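/* The list has the form "<rxq-id>:<core-id>[,<rxq-id>:<core-id>]...",
 * e.g. "0:3,1:7" pins rx queue 0 to core 3 and rx queue 1 to core 7. */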
static int
parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
{
unsigned i;
char *list, *copy, *key, *value;
int error = 0;
for (i = 0; i < n_rxq; i++) {
core_ids[i] = OVS_CORE_UNSPEC;
}
if (!affinity_list) {
return 0;
}
list = copy = xstrdup(affinity_list);
while (ofputil_parse_key_value(&list, &key, &value)) {
int rxq_id, core_id;
if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
|| !str_to_int(value, 0, &core_id) || core_id < 0) {
error = EINVAL;
break;
}
if (rxq_id < n_rxq) {
core_ids[rxq_id] = core_id;
}
}
free(copy);
return error;
}
/* Parses 'affinity_list' and applies configuration if it is valid. */
static int
dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
const char *affinity_list)
{
unsigned *core_ids, i;
int error = 0;
core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
error = EINVAL;
goto exit;
}
for (i = 0; i < port->n_rxq; i++) {
port->rxqs[i].core_id = core_ids[i];
}
exit:
free(core_ids);
return error;
}
/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
 * of the given PMD thread. */
static bool
dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_port *port)
OVS_EXCLUDED(pmd->port_mutex)
{
struct rxq_poll *poll;
bool found = false;
ovs_mutex_lock(&pmd->port_mutex);
HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
if (port == poll->rxq->port) {
found = true;
break;
}
}
ovs_mutex_unlock(&pmd->port_mutex);
return found;
}
/* Updates port configuration from the database. The changes are actually
* applied in dpif_netdev_run(). */
static int
dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
const struct smap *cfg)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_port *port;
int error = 0;
const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
ovs_mutex_lock(&dp->port_mutex);
error = get_port_by_number(dp, port_no, &port);
if (error) {
goto unlock;
}
if (emc_enabled != port->emc_enabled) {
struct dp_netdev_pmd_thread *pmd;
struct ds ds = DS_EMPTY_INITIALIZER;
uint32_t cur_min, insert_prob;
port->emc_enabled = emc_enabled;
/* Mark for reload all the threads that poll this port and request
 * a reconfiguration so that they are actually reloaded. */
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
if (dpif_netdev_pmd_polls_port(pmd, port)) {
pmd->need_reload = true;
}
}
dp_netdev_request_reconfigure(dp);
ds_put_format(&ds, "%s: EMC has been %s.",
netdev_get_name(port->netdev),
(emc_enabled) ? "enabled" : "disabled");
if (emc_enabled) {
ds_put_cstr(&ds, " Current insertion probability is ");
atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
if (!cur_min) {
ds_put_cstr(&ds, "zero.");
} else {
insert_prob = UINT32_MAX / cur_min;
ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
insert_prob, 100 / (float) insert_prob);
}
}
VLOG_INFO("%s", ds_cstr(&ds));
ds_destroy(&ds);
}
/* Checking for RXq affinity changes. */
if (!netdev_is_pmd(port->netdev)
|| nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
goto unlock;
}
error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
if (error) {
goto unlock;
}
free(port->rxq_affinity_list);
port->rxq_affinity_list = nullable_xstrdup(affinity_list);
dp_netdev_request_reconfigure(dp);
unlock:
ovs_mutex_unlock(&dp->port_mutex);
return error;
}
static int
dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
uint32_t queue_id, uint32_t *priority)
{
*priority = queue_id;
return 0;
}
/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
 * a copy of the 'size' bytes of the 'actions' input parameter. */
struct dp_netdev_actions *
dp_netdev_actions_create(const struct nlattr *actions, size_t size)
{
struct dp_netdev_actions *netdev_actions;
netdev_actions = xmalloc(sizeof *netdev_actions + size);
memcpy(netdev_actions->actions, actions, size);
netdev_actions->size = size;
return netdev_actions;
}
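/* Returns the actions currently attached to 'flow', protected by RCU. */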
struct dp_netdev_actions *
dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
{
return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
}
static void
dp_netdev_actions_free(struct dp_netdev_actions *actions)
{
free(actions);
}
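/* Stores the latest 'cycles' measurement of the given 'type' for rx queue
 * 'rx' with a relaxed atomic store. */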
static void
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
enum rxq_cycles_counter_type type,
unsigned long long cycles)
{
atomic_store_relaxed(&rx->cycles[type], cycles);
}
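/* Adds 'cycles' to the counter of the given 'type' for rx queue 'rx' with
 * a non-atomic read-modify-write. */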
static void
dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
enum rxq_cycles_counter_type type,
unsigned long long cycles)
{
non_atomic_ullong_add(&rx->cycles[type], cycles);
}
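/* Returns the accumulated cycles of the given 'type' for rx queue 'rx'. */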
static uint64_t
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
enum rxq_cycles_counter_type type)
{
unsigned long long processing_cycles;
atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
return processing_cycles;
}
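/* Stores 'cycles' in the next slot of the circular per-interval history
 * of rx queue 'rx' (PMD_RXQ_INTERVAL_MAX slots). */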
static void
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
unsigned long long cycles)
{
unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
}
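/* Returns the cycles recorded for interval 'idx' of rx queue 'rx'. */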
static uint64_t
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
{
unsigned long long processing_cycles;
atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
return processing_cycles;
}
#if ATOMIC_ALWAYS_LOCK_FREE_8B
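/* Returns true if collection of detailed PMD performance metrics is enabled
 * for the datapath of 'pmd'. */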
static inline bool
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
{
bool pmd_perf_enabled;
atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
return pmd_perf_enabled;
}
#else
/* If stores and reads of 64-bit integers are not atomic, the full PMD
 * performance metrics are not available, as locked access to 64-bit
 * integers would be prohibitively expensive. */
static inline bool
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
{
return false;
}
#endif
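/* Sends the packets buffered for tx port 'p', resets the output batch and
 * schedules the next flush.  The cycles spent on sending are distributed
 * evenly over the rx queues the packets originated from.  Returns the
 * number of packets sent. */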
static int
dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
struct tx_port *p)
{
int i;
int tx_qid;
int output_cnt;
bool dynamic_txqs;
struct cycle_timer timer;
uint64_t cycles;
uint32_t tx_flush_interval;
cycle_timer_start(&pmd->perf_stats, &timer);
dynamic_txqs = p->port->dynamic_txqs;
if (dynamic_txqs) {
tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
} else {
tx_qid = pmd->static_tx_qid;
}
output_cnt = dp_packet_batch_size(&p->output_pkts);
ovs_assert(output_cnt > 0);
netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
dp_packet_batch_init(&p->output_pkts);
/* Update time of the next flush. */
atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
p->flush_time = pmd->ctx.now + tx_flush_interval;
ovs_assert(pmd->n_output_batches > 0);
pmd->n_output_batches--;
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
/* Distribute send cycles evenly among transmitted packets and assign
 * them to their respective rx queues. */
cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
for (i = 0; i < output_cnt; i++) {
if (p->output_pkts_rxqs[i]) {
dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
RXQ_CYCLES_PROC_CURR, cycles);
}
}
return output_cnt;
}
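/* Flushes the output batches of all tx ports cached by 'pmd' whose flush
 * time has been reached, or all non-empty batches if 'force' is true.
 * Returns the total number of packets flushed. */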
static int
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
bool force)
{
struct tx_port *p;
int output_cnt = 0;
if (!pmd->n_output_batches) {
return 0;
}
HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
if (!dp_packet_batch_is_empty(&p->output_pkts)
&& (force || pmd->ctx.now >= p->flush_time)) {
output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
}
}
return output_cnt;
}
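/* Polls 'rxq' for a burst of packets, runs them through the datapath and
 * charges the cycles spent to 'rxq'.  Returns the number of packets
 * received. */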
static int
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_rxq *rxq,
odp_port_t port_no)
{
struct pmd_perf_stats *s = &pmd->perf_stats;
struct dp_packet_batch batch;
struct cycle_timer timer;
int error;
int batch_cnt = 0;
int rem_qlen = 0, *qlen_p = NULL;
uint64_t cycles;
/* Measure duration for polling and processing rx burst. */
cycle_timer_start(&pmd->perf_stats, &timer);
pmd->ctx.last_rxq = rxq;
dp_packet_batch_init(&batch);
/* Fetch the rx queue length only for vhostuser ports. */
if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
qlen_p = &rem_qlen;
}
error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
if (!error) {
/* At least one packet received. */
*recirc_depth_get() = 0;
pmd_thread_ctx_time_update(pmd);
batch_cnt = batch.count;
if (pmd_perf_metrics_enabled(pmd)) {
/* Update batch histogram. */
s->current.batches++;
histogram_add_sample(&s->pkts_per_batch, batch_cnt);
/* Update the maximum vhost rx queue fill level. */
if (rxq->is_vhost && rem_qlen >= 0) {
uint32_t qfill = batch_cnt + rem_qlen;
if (qfill > s->current.max_vhost_qfill) {
s->current.max_vhost_qfill = qfill;
}
}
}
/* Process packet batch. */
dp_netdev_input(pmd, &batch, port_no);
/* Assign processing cycles to rx queue. */
cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
dp_netdev_pmd_flush_output_packets(pmd, false);
} else {
/* Discard cycles. */
cycle_timer_stop(&pmd->perf_stats, &timer);
if (error != EAGAIN && error != EOPNOTSUPP) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
}
}
pmd->ctx.last_rxq = NULL;
return batch_cnt;
}
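/* Returns the tx port with the given 'port_no' in 'hmap', or NULL if there
 * is none. */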
static struct tx_port *
tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
{
struct tx_port *tx;
HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
if (tx->port->port_no == port_no) {
return tx;
}
}
return NULL;
}
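/* Closes the rx queues of 'port', applies any pending configuration changes
 * on its netdev and reallocates the per-queue state to match the new number
 * of rx and tx queues. */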
static int
port_reconfigure(struct dp_netdev_port *port)
{
struct netdev *netdev = port->netdev;
int i, err;
/* Closes the existing 'rxq's. */
for (i = 0; i < port->n_rxq; i++) {
netdev_rxq_close(port->rxqs[i].rx);
port->rxqs[i].rx = NULL;
}
unsigned last_nrxq = port->n_rxq;
port->n_rxq = 0;
/* Allows 'netdev' to apply the pending configuration changes. */
if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
err = netdev_reconfigure(netdev);
if (err && (err != EOPNOTSUPP)) {
VLOG_ERR("Failed to set interface %s new configuration",
netdev_get_name(netdev));
return err;
}
}
/* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
port->rxqs = xrealloc(port->rxqs,
sizeof *port->rxqs * netdev_n_rxq(netdev));
/* Realloc 'used' counters for tx queues. */
free(port->txq_used);
port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
for (i = 0; i < netdev_n_rxq(netdev); i++) {
bool new_queue = i >= last_nrxq;
if (new_queue) {
memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
}
port->rxqs[i].port = port;
port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
if (err) {
return err;
}
port->n_rxq++;
}
/* Parse affinity list to apply configuration for new queues. */
dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
/* If reconfiguration was successful, mark it as such, so we can use it. */
port->need_reconfigure = false;
return 0;
}
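/* Round-robin state used to spread rx queues over the non-isolated pmd
 * threads of each numa node. */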
struct rr_numa_list {
struct hmap numas; /* Contains 'struct rr_numa' */
};
struct rr_numa {
struct hmap_node node;
int numa_id;
/* Non-isolated pmds on numa node 'numa_id'. */
struct dp_netdev_pmd_thread **pmds;
int n_pmds;
int cur_index;
bool idx_inc;
};
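/* Returns the rr_numa for 'numa_id' in 'rr', or NULL if that numa node has
 * no non-isolated pmd threads. */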
static struct rr_numa *
rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
{
struct rr_numa *numa;
HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
if (numa->numa_id == numa_id) {
return numa;
}
}
return NULL;
}
/* Returns the next node in numa list following 'numa' in round-robin fashion.
* Returns first node if 'numa' is a null pointer or the last node in 'rr'.
* Returns NULL if 'rr' numa list is empty. */
static struct rr_numa *
rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
{
struct hmap_node *node = NULL;
if (numa) {
node = hmap_next(&rr->numas, &numa->node);
}
if (!node) {
node = hmap_first(&rr->numas);
}
return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
}
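/* Fills 'rr' with one rr_numa entry per numa node that has at least one
 * non-isolated pmd thread, recording those pmds for later round-robin
 * selection. */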
static void
rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
{
struct dp_netdev_pmd_thread *pmd;
struct rr_numa *numa;
hmap_init(&rr->numas);
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
continue;
}
numa = rr_numa_list_lookup(rr, pmd->numa_id);
if (!numa) {
numa = xzalloc(sizeof *numa);
numa->numa_id = pmd->numa_id;
hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
}
numa->n_pmds++;
numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
numa->pmds[numa->n_pmds - 1] = pmd;
/* At least one pmd so initialise cur_index and idx_inc. */
numa->cur_index = 0;
numa->idx_inc = true;
}
}
/*
* Returns the next pmd from the numa node.
*
* If 'updown' is 'true' it will alternate between selecting the next pmd in
* either an up or down walk, switching between up/down when the first or last
* core is reached. e.g. 1,2,3,3,2,1,1,2...
*
* If 'updown' is 'false' it will select the next pmd wrapping around when last
* core reached. e.g. 1,2,3,1,2,3,1,2...
*/
static struct dp_netdev_pmd_thread *
rr_numa_get_pmd(struct rr_numa *numa, bool updown)
{
int numa_idx = numa->cur_index;
if (numa->idx_inc == true) {
/* Incrementing through list of pmds. */
if (numa->cur_index == numa->n_pmds-1) {
/* Reached the last pmd. */
if (updown) {
numa->idx_inc = false;
} else {
numa->cur_index = 0;
}
} else {
numa->cur_index++;
}
} else {
/* Decrementing through list of pmds. */
if (numa->cur_index == 0) {
/* Reached the first pmd. */
numa->idx_inc = true;
} else {
numa->cur_index--;
}
}
return numa->pmds[numa_idx];
}
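/* Frees the pmd arrays and numa entries stored in 'rr'. */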
static void
rr_numa_list_destroy(struct rr_numa_list *rr)
{
struct rr_numa *numa;
HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
free(numa->pmds);
free(numa);
}
hmap_destroy(&rr->numas);
}
/* Sort Rx Queues by the processing cycles they are consuming. */
static int
compare_rxq_cycles(const void *a, const void *b)
{
struct dp_netdev_rxq *qa;
struct dp_netdev_rxq *qb;
uint64_t cycles_qa, cycles_qb;
qa = *(struct dp_netdev_rxq **) a;
qb = *(struct dp_netdev_rxq **) b;
cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
if (cycles_qa != cycles_qb) {
return (cycles_qa < cycles_qb) ? 1 : -1;
} else {
/* Cycles are the same so tiebreak on port/queue id.
* Tiebreaking (as opposed to return 0) ensures consistent
* sort results across multiple OS's. */
uint32_t port_qa = odp_to_u32(qa->port->port_no);
uint32_t port_qb = odp_to_u32(qb->port->port_no);
if (port_qa != port_qb) {
return port_qa > port_qb ? 1 : -1;
} else {
return netdev_rxq_get_queue_id(qa->rx)
- netdev_rxq_get_queue_id(qb->rx);
}
}
}
/* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
* queues and mark the pmds as isolated. Otherwise, assign non-isolated
* pmds to unpinned queues.
*
* The function doesn't touch the pmd threads, it just stores the assignment
* in the 'pmd' member of each rxq. */
static void
rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_port *port;
struct rr_numa_list rr;
struct rr_numa *non_local_numa = NULL;
struct dp_netdev_rxq ** rxqs = NULL;
int n_rxqs = 0;
struct rr_numa *numa = NULL;
int numa_id;
bool assign_cyc = dp->pmd_rxq_assign_cyc;
HMAP_FOR_EACH (port, node, &dp->ports) {
if (!netdev_is_pmd(port->netdev)) {
continue;
}
for (int qid = 0; qid < port->n_rxq; qid++) {
struct dp_netdev_rxq *q = &port->rxqs[qid];
if (pinned && q->core_id != OVS_CORE_UNSPEC) {
struct dp_netdev_pmd_thread *pmd;
pmd = dp_netdev_get_pmd(dp, q->core_id);
if (!pmd) {
VLOG_WARN("There is no PMD thread on core %d. Queue "
"%d on port \'%s\' will not be polled.",
q->core_id, qid, netdev_get_name(port->netdev));
} else {
q->pmd = pmd;
pmd->isolated = true;
dp_netdev_pmd_unref(pmd);
}
} else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
uint64_t cycle_hist = 0;
if (n_rxqs == 0) {
rxqs = xmalloc(sizeof *rxqs);
} else {
rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
}
if (assign_cyc) {
/* Sum the queue intervals and store the cycle history. */
for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
}
dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
cycle_hist);
}
/* Store the queue. */
rxqs[n_rxqs++] = q;
}
}
}
if (n_rxqs > 1 && assign_cyc) {
/* Sort the queues in order of the processing cycles
* they consumed during their last pmd interval. */
qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
}
rr_numa_list_populate(dp, &rr);
/* Assign the sorted queues to pmds in round robin. */
for (int i = 0; i < n_rxqs; i++) {
numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
numa = rr_numa_list_lookup(&rr, numa_id);
if (!numa) {
/* There are no pmds on the queue's local NUMA node.
Round robin on the NUMA nodes that do have pmds. */
non_local_numa = rr_numa_list_next(&rr, non_local_numa);
if (!non_local_numa) {
VLOG_ERR("There is no available (non-isolated) pmd "
"thread for port \'%s\' queue %d. This queue "
"will not be polled. Is pmd-cpu-mask set to "
"zero? Or are all PMDs isolated to other "
"queues?", netdev_rxq_get_name(rxqs[i]->rx),
netdev_rxq_get_queue_id(rxqs[i]->rx));
continue;
}
rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
VLOG_WARN("There's no available (non-isolated) pmd thread "
"on numa node %d. Queue %d on port \'%s\' will "
"be assigned to the pmd on core %d "
"(numa node %d). Expect reduced performance.",
numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
netdev_rxq_get_name(rxqs[i]->rx),
rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
} else {
rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
if (assign_cyc) {
VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
"rx queue %d "
"(measured processing cycles %"PRIu64").",
rxqs[i]->pmd->core_id, numa_id,
netdev_rxq_get_name(rxqs[i]->rx),
netdev_rxq_get_queue_id(rxqs[i]->rx),
dp_netdev_rxq_get_cycles(rxqs[i],
RXQ_CYCLES_PROC_HIST));
} else {
VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
"rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
netdev_rxq_get_name(rxqs[i]->rx),
netdev_rxq_get_queue_id(rxqs[i]->rx));
}
}
}
rr_numa_list_destroy(&rr);
free(rxqs);
}
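/* Reloads every pmd thread marked with 'need_reload', flushing its installed
 * flow marks first, and clears the flag once the reload is done. */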
static void
reload_affected_pmds(struct dp_netdev *dp)
{
struct dp_netdev_pmd_thread *pmd;
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
if (pmd->need_reload) {
flow_mark_flush(pmd);
dp_netdev_reload_pmd__(pmd);
pmd->need_reload = false;
}
}
}
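/* Starts or stops pmd threads so that the set of running threads matches the
 * requested 'pmd-cpu-mask' (or the default number of threads per numa node
 * when no mask is given).  Threads on cores that are no longer in the mask
 * are destroyed, missing ones are created, and remaining threads are
 * reloaded when their 'static_tx_qid's need to be kept sequential. */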
static void
reconfigure_pmd_threads(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_pmd_thread *pmd;
struct ovs_numa_dump *pmd_cores;
struct ovs_numa_info_core *core;
struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
struct hmapx_node *node;
bool changed = false;
bool need_to_adjust_static_tx_qids = false;
/* The pmd threads should be started only if there's a pmd port in the
* datapath. If the user didn't provide any "pmd-cpu-mask", we start
* NR_PMD_THREADS per numa node. */
if (!has_pmd_port(dp)) {
pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
} else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
} else {
pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
}
/* We need to adjust 'static_tx_qid's only if we're reducing number of
* PMD threads. Otherwise, new threads will allocate all the freed ids. */
if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
/* Adjustment is required to keep 'static_tx_qid's sequential and
* avoid possible issues, for example, imbalanced tx queue usage
* and unnecessary locking caused by remapping on netdev level. */
need_to_adjust_static_tx_qids = true;
}
/* Check for unwanted pmd threads */
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
if (pmd->core_id == NON_PMD_CORE_ID) {
continue;
}
if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
pmd->core_id)) {
hmapx_add(&to_delete, pmd);
} else if (need_to_adjust_static_tx_qids) {
pmd->need_reload = true;
}
}
HMAPX_FOR_EACH (node, &to_delete) {
pmd = (struct dp_netdev_pmd_thread *) node->data;
VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
pmd->numa_id, pmd->core_id);
dp_netdev_del_pmd(dp, pmd);
}
changed = !hmapx_is_empty(&to_delete);
hmapx_destroy(&to_delete);
if (need_to_adjust_static_tx_qids) {
/* 'static_tx_qid's are not sequential now.
* Reload remaining threads to fix this. */
reload_affected_pmds(dp);
}
/* Check for required new pmd threads */
FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
pmd = dp_netdev_get_pmd(dp, core->core_id);
if (!pmd) {
pmd = xzalloc(sizeof *pmd);
dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
pmd->numa_id, pmd->core_id);
changed = true;
} else {
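/* The pmd thread for this core already exists; drop the reference taken
 * by dp_netdev_get_pmd(). */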
dp_netdev_pmd_unref(pmd);
}
}
if (changed) {
struct ovs_numa_info_numa *numa;
/* Log the number of pmd threads per numa node. */
FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
numa->n_cores, numa->numa_id);
}
}
ovs_numa_dump_destroy(pmd_cores);
}
static void
pmd_remove_stale_ports(struct dp_netdev *dp,
struct dp_netdev_pmd_thread *pmd)
OVS_EXCLUDED(pmd->port_mutex)
OVS_REQUIRES(dp->port_mutex)
{
struct rxq_poll *poll, *poll_next;
struct tx_port *tx, *tx_next;
ovs_mutex_lock(&pmd->port_mutex);
HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
struct dp_netdev_port *port = poll->rxq->port;
if (port->need_reconfigure
|| !hmap_contains(&dp->ports, &port->node)) {
dp_netdev_del_rxq_from_pmd(pmd, poll);
}
}
HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
struct dp_netdev_port *port = tx->port;
if (port->need_reconfigure
|| !hmap_contains(&dp->ports, &port->node)) {
dp_netdev_del_port_tx_from_pmd(pmd, tx);
}
}
ovs_mutex_unlock(&pmd->port_mutex);
}
/* Must be called each time a port is added/removed or the cmask changes.
* This creates and destroys pmd threads, reconfigures ports, opens their
* rxqs and assigns all rxqs/txqs to pmd threads. */
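/* The steps below are ordered so that a port is never reconfigured while a
 * pmd thread is still using it and an rxq is never polled by two threads at
 * once; reload_affected_pmds() is the synchronization point between steps. */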
static void
reconfigure_datapath(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_pmd_thread *pmd;
struct dp_netdev_port *port;
int wanted_txqs;
dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
/* Step 1: Adjust the pmd threads based on the datapath ports, the cores
* on the system and the user configuration. */
reconfigure_pmd_threads(dp);
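/* Every thread in 'poll_threads' (the pmd threads plus the non-pmd thread)
 * transmits on its own queue id, so each port wants at least this many tx
 * queues for lockless transmit. */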
wanted_txqs = cmap_count(&dp->poll_threads);
/* The number of pmd threads might have changed, or a port can be new:
* adjust the txqs. */
HMAP_FOR_EACH (port, node, &dp->ports) {
netdev_set_tx_multiq(port->netdev, wanted_txqs);
}
/* Step 2: Remove from the pmd threads ports that have been removed or
* need reconfiguration. */
/* Check for all the ports that need reconfiguration. We cache this in
* 'port->need_reconfigure', because netdev_is_reconf_required() can
* change at any time. */
HMAP_FOR_EACH (port, node, &dp->ports) {
if (netdev_is_reconf_required(port->netdev)) {
port->need_reconfigure = true;
}
}
/* Remove from the pmd threads all the ports that have been deleted or
* need reconfiguration. */
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
pmd_remove_stale_ports(dp, pmd);
}
/* Reload affected pmd threads. We must wait for the pmd threads before
* reconfiguring the ports, because a port cannot be reconfigured while
* it's being used. */
reload_affected_pmds(dp);
/* Step 3: Reconfigure ports. */
/* We only reconfigure the ports that we determined above, because they're
* not being used by any pmd thread at the moment. If a port fails to
* reconfigure we remove it from the datapath. */
struct dp_netdev_port *next_port;
HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
int err;
if (!port->need_reconfigure) {
continue;
}
err = port_reconfigure(port);
if (err) {
hmap_remove(&dp->ports, &port->node);
seq_change(dp->port_seq);
port_destroy(port);
} else {
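/* Not enough tx queues for every thread: fall back to dynamic (XPS)
 * tx queue assignment, which requires locking on transmit. */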
port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
}
}
/* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
* for now, we just update the 'pmd' pointer in each rxq to point to the
* wanted thread according to the scheduling policy. */
/* Reset all the pmd threads to non-isolated. */
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
pmd->isolated = false;
}
/* Reset all the queues to unassigned */
HMAP_FOR_EACH (port, node, &dp->ports) {
for (int i = 0; i < port->n_rxq; i++) {
port->rxqs[i].pmd = NULL;
}
}
/* Add pinned queues and mark pmd threads isolated. */
rxq_scheduling(dp, true);
/* Add non-pinned queues. */
rxq_scheduling(dp, false);
/* Step 5: Remove queues not compliant with new scheduling. */
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
struct rxq_poll *poll, *poll_next;
ovs_mutex_lock(&pmd->port_mutex);
HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
if (poll->rxq->pmd != pmd) {
dp_netdev_del_rxq_from_pmd(pmd, poll);
}
}
ovs_mutex_unlock(&pmd->port_mutex);
}
/* Reload affected pmd threads. We must wait for the pmd threads to remove
* the old queues before readding them, otherwise a queue can be polled by
* two threads at the same time. */
reload_affected_pmds(dp);
/* Step 6: Add queues from scheduling, if they're not there already. */
HMAP_FOR_EACH (port, node, &dp->ports) {
if (!netdev_is_pmd(port->netdev)) {
continue;
}
for (int qid = 0; qid < port->n_rxq; qid++) {
struct dp_netdev_rxq *q = &port->rxqs[qid];
if (q->pmd) {
ovs_mutex_lock(&q->pmd->port_mutex);
dp_netdev_add_rxq_to_pmd(q->pmd, q);
ovs_mutex_unlock(&q->pmd->port_mutex);
}
}
}
/* Add every port to the tx cache of every pmd thread, if it's not
* there already and if this pmd has at least one rxq to poll. */
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
ovs_mutex_lock(&pmd->port_mutex);
if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
HMAP_FOR_EACH (port, node, &dp->ports) {
dp_netdev_add_port_tx_to_pmd(pmd, port);
}
}
ovs_mutex_unlock(&pmd->port_mutex);
}
/* Reload affected pmd threads. */
reload_affected_pmds(dp);
/* Check if PMD Auto LB is to be enabled */
set_pmd_auto_lb(dp);
}
/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
static bool
ports_require_restart(const struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_port *port;
HMAP_FOR_EACH (port, node, &dp->ports) {
if (netdev_is_reconf_required(port->netdev)) {
return true;
}
}
return false;
}
/* Calculates variance in the values stored in array 'a'. 'n' is the number
 * of elements in the array to be considered for calculating variance.
 * Usage example: data array 'a' contains the processing load of each pmd and
 * 'n' is the number of PMDs. It returns the variance in processing load of
 * PMDs. */
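/* Worked example with hypothetical loads: a = {70, 90, 20}, n = 3 gives
 * mean = 60, sqDiff = 100 + 900 + 1600 = 2600 and a variance of
 * 2600 / 3 = 866 with the integer arithmetic used below. */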
static uint64_t
variance(uint64_t a[], int n)
{
/* Compute mean (average of elements). */
uint64_t sum = 0;
uint64_t mean = 0;
uint64_t sqDiff = 0;
if (!n) {
return 0;
}
for (int i = 0; i < n; i++) {
sum += a[i];
}
if (sum) {
mean = sum / n;
/* Compute sum squared differences with mean. */
for (int i = 0; i < n; i++) {
sqDiff += (a[i] - mean) * (a[i] - mean);
}
}
return (sqDiff ? (sqDiff / n) : 0);
}
/* Computes the predicted variance in PMD usage from a dry run of rxq
 * assignment to PMDs and stores it in '*predicted_variance'. Returns
 * true on success. */
static bool
get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
uint32_t num_pmds, uint64_t *predicted_variance)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_port *port;
struct dp_netdev_pmd_thread *pmd;
struct dp_netdev_rxq **rxqs = NULL;
struct rr_numa *numa = NULL;
struct rr_numa_list rr;
int n_rxqs = 0;
bool ret = false;
uint64_t *pmd_usage;
if (!predicted_variance) {
return ret;
}
pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
HMAP_FOR_EACH (port, node, &dp->ports) {
if (!netdev_is_pmd(port->netdev)) {
continue;
}
for (int qid = 0; qid < port->n_rxq; qid++) {
struct dp_netdev_rxq *q = &port->rxqs[qid];
uint64_t cycle_hist = 0;
if (q->pmd->isolated) {
continue;
}
if (n_rxqs == 0) {
rxqs = xmalloc(sizeof *rxqs);
} else {
rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
}
/* Sum the queue intervals and store the cycle history. */
for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
}
dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
cycle_hist);
/* Store the queue. */
rxqs[n_rxqs++] = q;
}
}
if (n_rxqs > 1) {
/* Sort the queues in order of the processing cycles
* they consumed during their last pmd interval. */
qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
}
rr_numa_list_populate(dp, &rr);
for (int i = 0; i < n_rxqs; i++) {
int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
numa = rr_numa_list_lookup(&rr, numa_id);
if (!numa) {
/* Abort if cross NUMA polling. */
VLOG_DBG("PMD auto lb dry run."
" Aborting due to cross-numa polling.");
goto cleanup;
}
pmd = rr_numa_get_pmd(numa, true);
VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
"to be assigned port \'%s\' rx queue %d "
"(measured processing cycles %"PRIu64").",
pmd->core_id, numa_id,
netdev_rxq_get_name(rxqs[i]->rx),
netdev_rxq_get_queue_id(rxqs[i]->rx),
dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
for (int id = 0; id < num_pmds; id++) {
if (pmd->core_id == core_list[id]) {
/* Add the processing cycles of rxq to pmd polling it. */
pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
RXQ_CYCLES_PROC_HIST);
}
}
}
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
uint64_t total_cycles = 0;
if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
continue;
}
/* Get the total pmd cycles for an interval. */
atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
/* Estimate the cycles to cover all intervals. */
total_cycles *= PMD_RXQ_INTERVAL_MAX;
for (int id = 0; id < num_pmds; id++) {
if (pmd->core_id == core_list[id]) {
if (pmd_usage[id]) {
pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
}
VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
"usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
}
}
}
*predicted_variance = variance(pmd_usage, num_pmds);
ret = true;
cleanup:
rr_numa_list_destroy(&rr);
free(rxqs);
free(pmd_usage);
return ret;
}
/* Does a dry run of rxq assignment to PMDs and returns true if it gives a
 * better distribution of load across the PMDs. */
static bool
pmd_rebalance_dry_run(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_pmd_thread *pmd;
uint64_t *curr_pmd_usage;
uint64_t curr_variance;
uint64_t new_variance;
uint64_t improvement = 0;
uint32_t num_pmds;
uint32_t *pmd_corelist;
struct rxq_poll *poll, *poll_next;
bool ret;
num_pmds = cmap_count(&dp->poll_threads);
if (num_pmds > 1) {
curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
} else {
return false;
}
num_pmds = 0;
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
uint64_t total_cycles = 0;
uint64_t total_proc = 0;
if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
continue;
}
/* Get the total pmd cycles for an interval. */
atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
/* Estimate the cycles to cover all intervals. */
total_cycles *= PMD_RXQ_INTERVAL_MAX;
HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
uint64_t proc_cycles = 0;
for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
proc_cycles += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
}
total_proc += proc_cycles;
}
if (total_proc) {
curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
}
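/* Hypothetical example: total_proc of 3e9 cycles against total_cycles of
 * 6e9 gives a current usage of 50 (percent). */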
VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
pmd->core_id, curr_pmd_usage[num_pmds]);
if (atomic_count_get(&pmd->pmd_overloaded)) {
atomic_count_set(&pmd->pmd_overloaded, 0);
}
pmd_corelist[num_pmds] = pmd->core_id;
num_pmds++;
}
curr_variance = variance(curr_pmd_usage, num_pmds);
ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
if (ret) {
VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
" Predicted PMD variance: %"PRIu64"",
curr_variance, new_variance);
if (new_variance < curr_variance) {
improvement =
((curr_variance - new_variance) * 100) / curr_variance;
}
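/* Hypothetical example: curr_variance = 400 and new_variance = 100 give
 * improvement = (300 * 100) / 400 = 75, which is accepted only if it is
 * at least ALB_ACCEPTABLE_IMPROVEMENT. */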
if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
ret = false;
}
}
free(curr_pmd_usage);
free(pmd_corelist);
return ret;
}
/* Return true if needs to revalidate datapath flows. */
static bool
dpif_netdev_run(struct dpif *dpif)
{
struct dp_netdev_port *port;
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_pmd_thread *non_pmd;
uint64_t new_tnl_seq;
bool need_to_flush = true;
bool pmd_rebalance = false;
long long int now = time_msec();
struct dp_netdev_pmd_thread *pmd;
ovs_mutex_lock(&dp->port_mutex);
non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
if (non_pmd) {
ovs_mutex_lock(&dp->non_pmd_mutex);
HMAP_FOR_EACH (port, node, &dp->ports) {
if (!netdev_is_pmd(port->netdev)) {
int i;
if (port->emc_enabled) {
atomic_read_relaxed(&dp->emc_insert_min,
&non_pmd->ctx.emc_insert_min);
} else {
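/* An emc_insert_min of 0 disables exact match cache insertion for
 * packets received on this port. */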
non_pmd->ctx.emc_insert_min = 0;
}
for (i = 0; i < port->n_rxq; i++) {
if (dp_netdev_process_rxq_port(non_pmd,
&port->rxqs[i],
port->port_no)) {
need_to_flush = false;
}
}
}
}
if (need_to_flush) {
            /* We didn't receive anything in the process loop.
             * Check if we need to send something.
             * There were no time updates in the current iteration. */
pmd_thread_ctx_time_update(non_pmd);
dp_netdev_pmd_flush_output_packets(non_pmd, false);
}
dpif_netdev_xps_revalidate_pmd(non_pmd, false);
ovs_mutex_unlock(&dp->non_pmd_mutex);
dp_netdev_pmd_unref(non_pmd);
}
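    /* PMD auto load balancing: at most once per rebalance interval, check
     * whether any PMD thread has been overloaded for long enough.  If so,
     * dry-run the rxq-to-PMD assignment and request a datapath
     * reconfiguration only when the dry run predicts a better balance. */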
struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
if (pmd_alb->is_enabled) {
if (!pmd_alb->rebalance_poll_timer) {
pmd_alb->rebalance_poll_timer = now;
} else if ((pmd_alb->rebalance_poll_timer +
pmd_alb->rebalance_intvl) < now) {
pmd_alb->rebalance_poll_timer = now;
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
if (atomic_count_get(&pmd->pmd_overloaded) >=
PMD_RXQ_INTERVAL_MAX) {
pmd_rebalance = true;
break;
}
}
if (pmd_rebalance &&
!dp_netdev_is_reconf_required(dp) &&
!ports_require_restart(dp) &&
pmd_rebalance_dry_run(dp)) {
VLOG_INFO("PMD auto lb dry run."
" requesting datapath reconfigure.");
dp_netdev_request_reconfigure(dp);
}
}
}
if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
reconfigure_datapath(dp);
}
ovs_mutex_unlock(&dp->port_mutex);
tnl_neigh_cache_run();
tnl_port_map_run();
new_tnl_seq = seq_read(tnl_conf_seq);
if (dp->last_tnl_conf_seq != new_tnl_seq) {
dp->last_tnl_conf_seq = new_tnl_seq;
return true;
}
return false;
}
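/* Registers wakeup events for the calling thread: port reconfiguration
 * requests, rx activity on non-pmd ports and changes in the tunnel
 * configuration sequence number. */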
static void
dpif_netdev_wait(struct dpif *dpif)
{
struct dp_netdev_port *port;
struct dp_netdev *dp = get_dp_netdev(dpif);
ovs_mutex_lock(&dp_netdev_mutex);
ovs_mutex_lock(&dp->port_mutex);
HMAP_FOR_EACH (port, node, &dp->ports) {
netdev_wait_reconf_required(port->netdev);
if (!netdev_is_pmd(port->netdev)) {
int i;
for (i = 0; i < port->n_rxq; i++) {
netdev_rxq_wait(port->rxqs[i].rx);
}
}
}
ovs_mutex_unlock(&dp->port_mutex);
ovs_mutex_unlock(&dp_netdev_mutex);
seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
}
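/* Flushes any output packets still queued on the PMD thread, releases the
 * dynamically allocated tx queue ids and frees the thread-local copies of
 * the tx port caches. */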
static void
pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
{
struct tx_port *tx_port_cached;
/* Flush all the queued packets. */
dp_netdev_pmd_flush_output_packets(pmd, true);
/* Free all used tx queue ids. */
dpif_netdev_xps_revalidate_pmd(pmd, true);
HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
free(tx_port_cached);
}
HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
free(tx_port_cached);
}
}
/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
 * thread-local copies: into 'pmd->tnl_port_cache' if the port is a tunnel
 * device, and into 'pmd->send_port_cache' if the port has at least one txq
 * (a tunnel port with txqs ends up in both caches). */
static void
pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
OVS_REQUIRES(pmd->port_mutex)
{
struct tx_port *tx_port, *tx_port_cached;
pmd_free_cached_ports(pmd);
hmap_shrink(&pmd->send_port_cache);
hmap_shrink(&pmd->tnl_port_cache);
HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
hash_port_no(tx_port_cached->port->port_no));
}
if (netdev_n_txq(tx_port->port->netdev)) {
tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
hash_port_no(tx_port_cached->port->port_no));
}
}
}
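/* Reserves a datapath-wide unique tx queue id for this PMD thread from
 * 'tx_qid_pool'.  Allocation failure aborts the process. */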
static void
pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
{
ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
", numa_id %d.", pmd->core_id, pmd->numa_id);
}
ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
}
static void
pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
{
ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
}
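/* Snapshots the PMD's poll list into the flat '*ppoll_list' array (resizing
 * it as needed) and refreshes the cached tx ports, all under
 * 'pmd->port_mutex'.  Returns the number of polled queues. */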
static int
pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
struct polled_queue **ppoll_list)
{
struct polled_queue *poll_list = *ppoll_list;
struct rxq_poll *poll;
int i;
ovs_mutex_lock(&pmd->port_mutex);
poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
* sizeof *poll_list);
i = 0;
HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
poll_list[i].rxq = poll->rxq;
poll_list[i].port_no = poll->rxq->port->port_no;
poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
i++;
}
pmd_load_cached_ports(pmd);
ovs_mutex_unlock(&pmd->port_mutex);
*ppoll_list = poll_list;
return i;
}
static void *
pmd_thread_main(void *f_)
{
struct dp_netdev_pmd_thread *pmd = f_;
struct pmd_perf_stats *s = &pmd->perf_stats;
unsigned int lc = 0;
struct polled_queue *poll_list;
bool exiting;
int poll_cnt;
int i;
int process_packets = 0;
poll_list = NULL;
/* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
ovs_numa_thread_setaffinity_core(pmd->core_id);
dpdk_set_lcore_id(pmd->core_id);
poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
dfc_cache_init(&pmd->flow_cache);
reload:
pmd_alloc_static_tx_qid(pmd);
atomic_count_init(&pmd->pmd_overloaded, 0);
/* List port/core affinity */
for (i = 0; i < poll_cnt; i++) {
VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
/* Reset the rxq current cycles counter. */
dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
}
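    /* With an empty poll list there is nothing to do: block until the main
     * thread bumps 'reload_seq'.  Setting 'lc' to UINT_MAX makes the first
     * loop iteration below check the reload flag right away. */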
if (!poll_cnt) {
while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
seq_wait(pmd->reload_seq, pmd->last_reload_seq);
poll_block();
}
lc = UINT_MAX;
}
pmd->intrvl_tsc_prev = 0;
atomic_store_relaxed(&pmd->intrvl_cycles, 0);
cycles_counter_update(s);
/* Protect pmd stats from external clearing while polling. */
ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
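    /* Main processing loop: poll every rxq in the poll list, flush queued
     * output if nothing was received, and account each iteration in the
     * PMD performance statistics. */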
for (;;) {
uint64_t rx_packets = 0, tx_packets = 0;
pmd_perf_start_iteration(s);
for (i = 0; i < poll_cnt; i++) {
if (poll_list[i].emc_enabled) {
atomic_read_relaxed(&pmd->dp->emc_insert_min,
&pmd->ctx.emc_insert_min);
} else {
pmd->ctx.emc_insert_min = 0;
}
process_packets =
dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
poll_list[i].port_no);
rx_packets += process_packets;
}
if (!rx_packets) {
            /* We didn't receive anything in the process loop.
             * Check if we need to send something.
             * There were no time updates in the current iteration. */
pmd_thread_ctx_time_update(pmd);
tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
}
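        /* Do the slower housekeeping only once in a while (every 1024
         * iterations): coverage counters, poll list optimization, RCU
         * quiescing (with an EMC slow sweep as fallback) and the check for
         * a requested reload. */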
if (lc++ > 1024) {
bool reload;
lc = 0;
coverage_try_clear();
dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
if (!ovsrcu_try_quiesce()) {
emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
}
atomic_read_relaxed(&pmd->reload, &reload);
if (reload) {
break;
}
}
pmd_perf_end_iteration(s, rx_packets, tx_packets,
pmd_perf_metrics_enabled(pmd));
}
ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
exiting = latch_is_set(&pmd->exit_latch);
/* Signal here to make sure the pmd finishes
* reloading the updated configuration. */
dp_netdev_pmd_reload_done(pmd);
pmd_free_static_tx_qid(pmd);
if (!exiting) {
goto reload;
}
dfc_cache_uninit(&pmd->flow_cache);
free(poll_list);
pmd_free_cached_ports(pmd);
return NULL;
}
static void
dp_netdev_disable_upcall(struct dp_netdev *dp)
OVS_ACQUIRES(dp->upcall_rwlock)
{
fat_rwlock_wrlock(&dp->upcall_rwlock);
}
/* Meters */
static void
dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
struct ofputil_meter_features *features)
{
features->max_meters = MAX_METERS;
features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
features->max_bands = MAX_BANDS;
features->max_color = 0;
}
/* Applies the meter identified by 'meter_id' to 'packets_'. Packets
* that exceed a band are dropped in-place. */
static void
dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
uint32_t meter_id, long long int now)
{
struct dp_meter *meter;
struct dp_meter_band *band;
struct dp_packet *packet;
long long int long_delta_t; /* msec */
uint32_t delta_t; /* msec */
const size_t cnt = dp_packet_batch_size(packets_);
uint32_t bytes, volume;
int exceeded_band[NETDEV_MAX_BURST];
uint32_t exceeded_rate[NETDEV_MAX_BURST];
int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
if (meter_id >= MAX_METERS) {
return;
}
meter_lock(dp, meter_id);
meter = dp->meters[meter_id];
if (!meter) {
goto out;
}
/* Initialize as negative values. */
memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
/* Initialize as zeroes. */
memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
/* All packets will hit the meter at the same time. */
long_delta_t = (now - meter->used) / 1000; /* msec */
/* Make sure delta_t will not be too large, so that bucket will not
* wrap around below. */
delta_t = (long_delta_t > (long long int)meter->max_delta_t)
? meter->max_delta_t : (uint32_t)long_delta_t;
/* Update meter stats. */
meter->used = now;
meter->packet_count += cnt;
bytes = 0;
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
bytes += dp_packet_size(packet);
}
meter->byte_count += bytes;
/* Meters can operate in terms of packets per second or kilobits per
* second. */
if (meter->flags & OFPMF13_PKTPS) {
/* Rate in packets/second, bucket 1/1000 packets. */
/* msec * packets/sec = 1/1000 packets. */
volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
} else {
/* Rate in kbps, bucket in bits. */
/* msec * kbps = bits */
volume = bytes * 8;
}
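/* Illustrative unit check (example numbers, not part of the algorithm):
 * for a PKTPS meter a 2-packet batch takes volume = 2 * 1000 = 2000
 * bucket units (1/1000 packets each); for a KBPS meter two 64-byte
 * packets take volume = 128 * 8 = 1024 bucket units (bits).  In both
 * cases 'band->up.rate' numerically equals bucket units per elapsed
 * millisecond, which is why the buckets below are refilled with
 * 'delta_t * band->up.rate'. */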
/* Update all bands and find the one hit with the highest rate for each
* packet (if any). */
for (int m = 0; m < meter->n_bands; ++m) {
band = &meter->bands[m];
/* Update band's bucket. */
band->bucket += delta_t * band->up.rate;
if (band->bucket > band->up.burst_size) {
band->bucket = band->up.burst_size;
}
/* Drain the bucket for all the packets, if possible. */
if (band->bucket >= volume) {
band->bucket -= volume;
} else {
int band_exceeded_pkt;
/* Band limit hit, must process packet-by-packet. */
if (meter->flags & OFPMF13_PKTPS) {
band_exceeded_pkt = band->bucket / 1000;
band->bucket %= 1000; /* Remainder stays in bucket. */
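/* Example with assumed numbers: if 'band->bucket' held 2500 units
 * (2.5 packets' worth), the first 2 packets of the batch are still
 * covered by this band (band_exceeded_pkt == 2) and 500 units remain;
 * packets from index 2 onwards are marked as exceeding below. */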
/* Update the exceeding band for each exceeding packet.
* (Only one band will be fired by a packet, and that
* can be different for each packet.) */
for (int i = band_exceeded_pkt; i < cnt; i++) {
if (band->up.rate > exceeded_rate[i]) {
exceeded_rate[i] = band->up.rate;
exceeded_band[i] = m;
}
}
} else {
/* Packet sizes differ, must process one-by-one. */
band_exceeded_pkt = cnt;
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
uint32_t bits = dp_packet_size(packet) * 8;
if (band->bucket >= bits) {
band->bucket -= bits;
} else {
if (i < band_exceeded_pkt) {
band_exceeded_pkt = i;
}
/* Update the exceeding band for the exceeding packet.
* (Only one band will be fired by a packet, and that
* can be different for each packet.) */
if (band->up.rate > exceeded_rate[i]) {
exceeded_rate[i] = band->up.rate;
exceeded_band[i] = m;
}
}
}
}
/* Remember the first exceeding packet. */
if (exceeded_pkt > band_exceeded_pkt) {
exceeded_pkt = band_exceeded_pkt;
}
}
}
/* Fire the highest rate band exceeded by each packet, and drop
* packets if needed. */
size_t j;
DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
if (exceeded_band[j] >= 0) {
/* Meter drop packet. */
band = &meter->bands[exceeded_band[j]];
band->packet_count += 1;
band->byte_count += dp_packet_size(packet);
dp_packet_delete(packet);
} else {
/* Meter accepts packet. */
dp_packet_batch_refill(packets_, packet, j);
}
}
out:
meter_unlock(dp, meter_id);
}
/* Meter set/get/del processing is still single-threaded. */
static int
dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
struct ofputil_meter_config *config)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
uint32_t mid = meter_id.uint32;
struct dp_meter *meter;
int i;
if (mid >= MAX_METERS) {
return EFBIG; /* Meter_id out of range. */
}
if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
return EBADF; /* Unsupported flags set */
}
if (config->n_bands > MAX_BANDS) {
return EINVAL;
}
for (i = 0; i < config->n_bands; ++i) {
switch (config->bands[i].type) {
case OFPMBT13_DROP:
break;
default:
return ENODEV; /* Unsupported band type */
}
}
/* Allocate meter */
meter = xzalloc(sizeof *meter
+ config->n_bands * sizeof(struct dp_meter_band));
meter->flags = config->flags;
meter->n_bands = config->n_bands;
meter->max_delta_t = 0;
meter->used = time_usec();
/* Set up bands. */
for (i = 0; i < config->n_bands; ++i) {
uint32_t band_max_delta_t;
/* Set burst size to a workable value if none specified. */
if (config->bands[i].burst_size == 0) {
config->bands[i].burst_size = config->bands[i].rate;
}
meter->bands[i].up = config->bands[i];
/* Convert burst size to the bucket units: */
/* pkts => 1/1000 packets, kilobits => bits. */
meter->bands[i].up.burst_size *= 1000;
/* Initialize bucket to empty. */
meter->bands[i].bucket = 0;
/* Figure out max delta_t that is enough to fill any bucket. */
band_max_delta_t
= meter->bands[i].up.burst_size / meter->bands[i].up.rate;
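/* Unit sanity check: 'burst_size' is now in bucket units and 'rate'
 * numerically equals bucket units per millisecond, so the quotient
 * above is the number of milliseconds needed to fill the bucket from
 * empty. */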
if (band_max_delta_t > meter->max_delta_t) {
meter->max_delta_t = band_max_delta_t;
}
}
meter_lock(dp, mid);
dp_delete_meter(dp, mid); /* Free existing meter, if any */
dp->meters[mid] = meter;
meter_unlock(dp, mid);
return 0;
}
static int
dpif_netdev_meter_get(const struct dpif *dpif,
ofproto_meter_id meter_id_,
struct ofputil_meter_stats *stats, uint16_t n_bands)
{
const struct dp_netdev *dp = get_dp_netdev(dpif);
uint32_t meter_id = meter_id_.uint32;
int retval = 0;
if (meter_id >= MAX_METERS) {
return EFBIG;
}
meter_lock(dp, meter_id);
const struct dp_meter *meter = dp->meters[meter_id];
if (!meter) {
retval = ENOENT;
goto done;
}
if (stats) {
int i = 0;
stats->packet_in_count = meter->packet_count;
stats->byte_in_count = meter->byte_count;
for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
stats->bands[i].packet_count = meter->bands[i].packet_count;
stats->bands[i].byte_count = meter->bands[i].byte_count;
}
stats->n_bands = i;
}
done:
meter_unlock(dp, meter_id);
return retval;
}
static int
dpif_netdev_meter_del(struct dpif *dpif,
ofproto_meter_id meter_id_,
struct ofputil_meter_stats *stats, uint16_t n_bands)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
int error;
error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
if (!error) {
uint32_t meter_id = meter_id_.uint32;
meter_lock(dp, meter_id);
dp_delete_meter(dp, meter_id);
meter_unlock(dp, meter_id);
}
return error;
}
static void
dpif_netdev_disable_upcall(struct dpif *dpif)
OVS_NO_THREAD_SAFETY_ANALYSIS
{
struct dp_netdev *dp = get_dp_netdev(dpif);
dp_netdev_disable_upcall(dp);
}
static void
dp_netdev_enable_upcall(struct dp_netdev *dp)
OVS_RELEASES(dp->upcall_rwlock)
{
fat_rwlock_unlock(&dp->upcall_rwlock);
}
static void
dpif_netdev_enable_upcall(struct dpif *dpif)
OVS_NO_THREAD_SAFETY_ANALYSIS
{
struct dp_netdev *dp = get_dp_netdev(dpif);
dp_netdev_enable_upcall(dp);
}
static void
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
{
ovs_mutex_lock(&pmd->cond_mutex);
atomic_store_relaxed(&pmd->reload, false);
pmd->last_reload_seq = seq_read(pmd->reload_seq);
xpthread_cond_signal(&pmd->cond);
ovs_mutex_unlock(&pmd->cond_mutex);
}
/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
* the pointer on success, otherwise NULL (it can return NULL even if
* 'core_id' is NON_PMD_CORE_ID).
*
* The caller must unref the returned reference. */
static struct dp_netdev_pmd_thread *
dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
{
struct dp_netdev_pmd_thread *pmd;
const struct cmap_node *pnode;
pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
if (!pnode) {
return NULL;
}
pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
}
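/* A minimal usage sketch (illustrative only, not called here): callers
 * must be prepared for a NULL return, even for NON_PMD_CORE_ID, and must
 * drop the reference when done:
 *
 *     struct dp_netdev_pmd_thread *pmd
 *         = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
 *     if (pmd) {
 *         ...use 'pmd'...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */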
/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
static void
dp_netdev_set_nonpmd(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex)
{
struct dp_netdev_pmd_thread *non_pmd;
non_pmd = xzalloc(sizeof *non_pmd);
dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
}
/* Caller must have valid pointer to 'pmd'. */
static bool
dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
{
return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
}
static void
dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
{
if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
}
}
/* Given cmap position 'pos', tries to ref the next node. If try_ref()
* fails, keeps checking for next node until reaching the end of cmap.
*
* The caller must unref the returned reference. */
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
{
struct dp_netdev_pmd_thread *next;
do {
struct cmap_node *node;
node = cmap_next_position(&dp->poll_threads, pos);
next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
: NULL;
} while (next && !dp_netdev_pmd_try_ref(next));
return next;
}
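/* Typical iteration pattern over all pmd threads using the function above
 * (an illustrative sketch; 'pos' must start zeroed so the walk begins at
 * the first cmap entry):
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos))) {
 *         ...use 'pmd'...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */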
/* Configures the 'pmd' based on the input argument. */
static void
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
unsigned core_id, int numa_id)
{
pmd->dp = dp;
pmd->core_id = core_id;
pmd->numa_id = numa_id;
pmd->need_reload = false;
pmd->n_output_batches = 0;
ovs_refcount_init(&pmd->ref_cnt);
latch_init(&pmd->exit_latch);
pmd->reload_seq = seq_create();
pmd->last_reload_seq = seq_read(pmd->reload_seq);
atomic_init(&pmd->reload, false);
xpthread_cond_init(&pmd->cond, NULL);
ovs_mutex_init(&pmd->cond_mutex);
ovs_mutex_init(&pmd->flow_mutex);
ovs_mutex_init(&pmd->port_mutex);
cmap_init(&pmd->flow_table);
cmap_init(&pmd->classifiers);
pmd->ctx.last_rxq = NULL;
pmd_thread_ctx_time_update(pmd);
pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
hmap_init(&pmd->poll_list);
hmap_init(&pmd->tx_ports);
hmap_init(&pmd->tnl_port_cache);
hmap_init(&pmd->send_port_cache);
/* init the 'flow_cache' since there is no
* actual thread created for NON_PMD_CORE_ID. */
if (core_id == NON_PMD_CORE_ID) {
dfc_cache_init(&pmd->flow_cache);
pmd_alloc_static_tx_qid(pmd);
}
pmd_perf_stats_init(&pmd->perf_stats);
cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
hash_int(core_id, 0));
}
static void
dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
{
struct dpcls *cls;
dp_netdev_pmd_flow_flush(pmd);
hmap_destroy(&pmd->send_port_cache);
hmap_destroy(&pmd->tnl_port_cache);
hmap_destroy(&pmd->tx_ports);
hmap_destroy(&pmd->poll_list);
/* All flows (including their dpcls_rules) have been deleted already */
CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
dpcls_destroy(cls);
ovsrcu_postpone(free, cls);
}
cmap_destroy(&pmd->classifiers);
cmap_destroy(&pmd->flow_table);
ovs_mutex_destroy(&pmd->flow_mutex);
latch_destroy(&pmd->exit_latch);
seq_destroy(pmd->reload_seq);
xpthread_cond_destroy(&pmd->cond);
ovs_mutex_destroy(&pmd->cond_mutex);
ovs_mutex_destroy(&pmd->port_mutex);
free(pmd);
}
/* Stops the pmd thread, removes it from the 'dp->poll_threads',
* and unrefs the struct. */
static void
dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
{
/* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
* but extra cleanup is necessary */
if (pmd->core_id == NON_PMD_CORE_ID) {
ovs_mutex_lock(&dp->non_pmd_mutex);
dfc_cache_uninit(&pmd->flow_cache);
pmd_free_cached_ports(pmd);
pmd_free_static_tx_qid(pmd);
ovs_mutex_unlock(&dp->non_pmd_mutex);
} else {
latch_set(&pmd->exit_latch);
dp_netdev_reload_pmd__(pmd);
xpthread_join(pmd->thread, NULL);
}
dp_netdev_pmd_clear_ports(pmd);
/* Purges the 'pmd''s flows after stopping the thread, but before
* destroying the flows, so that the flow stats can be collected. */
if (dp->dp_purge_cb) {
dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
}
cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
dp_netdev_pmd_unref(pmd);
}
/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
* thread. */
static void
dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
{
struct dp_netdev_pmd_thread *pmd;
struct dp_netdev_pmd_thread **pmd_list;
size_t k = 0, n_pmds;
n_pmds = cmap_count(&dp->poll_threads);
pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
continue;
}
/* We cannot call dp_netdev_del_pmd(), since it alters
* 'dp->poll_threads' (while we're iterating it) and it
* might quiesce. */
ovs_assert(k < n_pmds);
pmd_list[k++] = pmd;
}
for (size_t i = 0; i < k; i++) {
dp_netdev_del_pmd(dp, pmd_list[i]);
}
free(pmd_list);
}
/* Deletes all rx queues from pmd->poll_list and all the ports from
* pmd->tx_ports. */
static void
dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
{
struct rxq_poll *poll;
struct tx_port *port;
ovs_mutex_lock(&pmd->port_mutex);
HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
free(poll);
}
HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
free(port);
}
ovs_mutex_unlock(&pmd->port_mutex);
}
/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
static void
dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_rxq *rxq)
OVS_REQUIRES(pmd->port_mutex)
{
int qid = netdev_rxq_get_queue_id(rxq->rx);
uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
struct rxq_poll *poll;
HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
if (poll->rxq == rxq) {
/* 'rxq' is already polled by this thread. Do nothing. */
return;
}
}
poll = xmalloc(sizeof *poll);
poll->rxq = rxq;
hmap_insert(&pmd->poll_list, &poll->node, hash);
pmd->need_reload = true;
}
/* Delete 'poll' from poll_list of PMD thread. */
static void
dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
struct rxq_poll *poll)
OVS_REQUIRES(pmd->port_mutex)
{
hmap_remove(&pmd->poll_list, &poll->node);
free(poll);
pmd->need_reload = true;
}
/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
* changes to take effect. */
static void
dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_port *port)
OVS_REQUIRES(pmd->port_mutex)
{
struct tx_port *tx;
tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
if (tx) {
/* 'port' is already in this thread's tx cache. Do nothing. */
return;
}
tx = xzalloc(sizeof *tx);
tx->port = port;
tx->qid = -1;
tx->flush_time = 0LL;
dp_packet_batch_init(&tx->output_pkts);
hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
pmd->need_reload = true;
}
/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
* changes to take effect. */
static void
dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
struct tx_port *tx)
OVS_REQUIRES(pmd->port_mutex)
{
hmap_remove(&pmd->tx_ports, &tx->node);
free(tx);
pmd->need_reload = true;
}
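/* Returns the datapath version string. The userspace datapath is built into
* ovs-vswitchd, so it reports the static string "<built-in>". */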
static char *
dpif_netdev_get_datapath_version(void)
{
return xstrdup("<built-in>");
}
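/* Updates the statistics of 'netdev_flow': stores 'now' as the last-used
* timestamp and adds 'cnt' packets, 'size' bytes and 'tcp_flags' to the
* accumulated counters. */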
static void
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
uint16_t tcp_flags, long long now)
{
uint16_t flags;
atomic_store_relaxed(&netdev_flow->stats.used, now);
non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
flags |= tcp_flags;
atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
}
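/* Hands 'packet_' and its extracted 'flow' to the datapath's registered
* upcall callback. Returns ENODEV if no callback is registered. With debug
* logging enabled, the flow key and packet are also logged. */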
static int
dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
enum dpif_upcall_type type, const struct nlattr *userdata,
struct ofpbuf *actions, struct ofpbuf *put_actions)
{
struct dp_netdev *dp = pmd->dp;
if (OVS_UNLIKELY(!dp->upcall_cb)) {
return ENODEV;
}
if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
struct ds ds = DS_EMPTY_INITIALIZER;
char *packet_str;
struct ofpbuf key;
struct odp_flow_key_parms odp_parms = {
.flow = flow,
.mask = wc ? &wc->masks : NULL,
.support = dp_netdev_support,
};
ofpbuf_init(&key, 0);
odp_flow_key_from_flow(&odp_parms, &key);
packet_str = ofp_dp_packet_to_string(packet_);
odp_flow_key_format(key.data, key.size, &ds);
VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
ofpbuf_uninit(&key);
free(packet_str);
ds_destroy(&ds);
}
return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
actions, wc, put_actions, dp->upcall_aux);
}
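/* Returns the RSS hash of 'packet'. If the packet does not carry a valid
* hash, a 5-tuple hash is computed from 'mf' and cached in the packet. */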
static inline uint32_t
dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
const struct miniflow *mf)
{
uint32_t hash;
if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
hash = dp_packet_get_rss_hash(packet);
} else {
hash = miniflow_hash_5tuple(mf, 0);
dp_packet_set_rss_hash(packet, hash);
}
return hash;
}
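/* As dpif_netdev_packet_get_rss_hash_orig_pkt(), but also folds the current
* recirculation depth into the hash so that recirculated packets do not
* collide in the exact match cache. */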
static inline uint32_t
dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
const struct miniflow *mf)
{
uint32_t hash, recirc_depth;
if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
hash = dp_packet_get_rss_hash(packet);
} else {
hash = miniflow_hash_5tuple(mf, 0);
dp_packet_set_rss_hash(packet, hash);
}
/* The RSS hash must account for the recirculation depth to avoid
* collisions in the exact match cache */
recirc_depth = *recirc_depth_get_unsafe();
if (OVS_UNLIKELY(recirc_depth)) {
hash = hash_finish(hash, recirc_depth);
dp_packet_set_rss_hash(packet, hash);
}
return hash;
}
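/* A batch of packets that all matched the same flow, collected so that the
* flow's statistics can be updated and its actions executed once for the
* whole batch. */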
struct packet_batch_per_flow {
unsigned int byte_count;
uint16_t tcp_flags;
struct dp_netdev_flow *flow;
struct dp_packet_batch array;
};
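/* Appends 'packet' to 'batch', accumulating its byte count and TCP flags. */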
static inline void
packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
struct dp_packet *packet,
uint16_t tcp_flags)
{
batch->byte_count += dp_packet_size(packet);
batch->tcp_flags |= tcp_flags;
batch->array.packets[batch->array.count++] = packet;
}
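/* Initializes 'batch' for 'flow' and links the flow back to its batch. */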
static inline void
packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
struct dp_netdev_flow *flow)
{
flow->batch = batch;
batch->flow = flow;
dp_packet_batch_init(&batch->array);
batch->byte_count = 0;
batch->tcp_flags = 0;
}
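/* Updates the statistics of the batch's flow and executes the flow's actions
* on all packets in 'batch'. */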
static inline void
packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
struct dp_netdev_pmd_thread *pmd)
{
struct dp_netdev_actions *actions;
struct dp_netdev_flow *flow = batch->flow;
dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
batch->tcp_flags, pmd->ctx.now / 1000);
actions = dp_netdev_flow_get_actions(flow);
dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
actions->actions, actions->size);
}
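/* Adds 'pkt' to the per-flow batch of 'flow'. If the flow has no batch yet,
* a new one is taken from 'batches' and '*n_batches' is incremented. */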
static inline void
dp_netdev_queue_batches(struct dp_packet *pkt,
struct dp_netdev_flow *flow, uint16_t tcp_flags,
struct packet_batch_per_flow *batches,
size_t *n_batches)
{
struct packet_batch_per_flow *batch = flow->batch;
if (OVS_UNLIKELY(!batch)) {
batch = &batches[(*n_batches)++];
packet_batch_per_flow_init(batch, flow);
}
packet_batch_per_flow_update(batch, pkt, tcp_flags);
}
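/* Records 'packet', its matching 'flow' and 'tcp_flags' at position 'index'
* of 'flow_map', preserving the order in which packets were received. */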
static inline void
packet_enqueue_to_flow_map(struct dp_packet *packet,
struct dp_netdev_flow *flow,
uint16_t tcp_flags,
struct dp_packet_flow_map *flow_map,
size_t index)
{
struct dp_packet_flow_map *map = &flow_map[index];
map->flow = flow;
map->packet = packet;
map->tcp_flags = tcp_flags;
}
/* SMC lookup function for a batch of packets.
* By batching the SMC lookups we can use prefetching
* to hide memory access latency.
*/
static inline void
smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
struct netdev_flow_key *keys,
struct netdev_flow_key **missed_keys,
struct dp_packet_batch *packets_,
const int cnt,
struct dp_packet_flow_map *flow_map,
uint8_t *index_map)
{
int i;
struct dp_packet *packet;
size_t n_smc_hit = 0, n_missed = 0;
struct dfc_cache *cache = &pmd->flow_cache;
struct smc_cache *smc_cache = &cache->smc_cache;
const struct cmap_node *flow_node;
int recv_idx;
uint16_t tcp_flags;
/* Prefetch buckets for all packets */
for (i = 0; i < cnt; i++) {
OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
}
DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
struct dp_netdev_flow *flow = NULL;
flow_node = smc_entry_get(pmd, keys[i].hash);
bool hit = false;
/* Get the original position of this packet in the received batch. */
recv_idx = index_map[i];
if (OVS_LIKELY(flow_node != NULL)) {
CMAP_NODE_FOR_EACH (flow, node, flow_node) {
/* Since we don't have per-port megaflow to check the port
* number, we need to verify that the input ports match. */
if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
/* SMC hit and EMC miss, so insert the key and flow into the EMC. */
keys[i].len =
netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
emc_probabilistic_insert(pmd, &keys[i], flow);
/* Add these packets into the flow map in the same order
* as received.
*/
packet_enqueue_to_flow_map(packet, flow, tcp_flags,
flow_map, recv_idx);
n_smc_hit++;
hit = true;
break;
}
}
if (hit) {
continue;
}
}
/* SMC missed. Group missed packets together at
* the beginning of the 'packets' array. */
dp_packet_batch_refill(packets_, packet, i);
/* Preserve the original packet order for flow batching. */
index_map[n_missed] = recv_idx;
/* Put missed keys into the pointer array returned to the caller. */
missed_keys[n_missed++] = &keys[i];
}
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
}
/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
* 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
* miniflow is copied into 'keys' and the packet pointer is moved to the
* beginning of the 'packets' array. The pointers of missed keys are put in the
* 'missed_keys' pointer array for future processing.
*
* The function returns the number of packets that need to be processed in the
* 'packets' array (they have been moved to the beginning of the vector).
*
* For performance reasons a caller may choose not to initialize the metadata
* in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
* is not valid and must be initialized by this function using 'port_no'.
* If 'md_is_valid' is true, the metadata is already valid and 'port_no'
* will be ignored.
*/
static inline size_t
dfc_processing(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets_,
struct netdev_flow_key *keys,
struct netdev_flow_key **missed_keys,
struct packet_batch_per_flow batches[], size_t *n_batches,
struct dp_packet_flow_map *flow_map,
size_t *n_flows, uint8_t *index_map,
bool md_is_valid, odp_port_t port_no)
{
struct netdev_flow_key *key = &keys[0];
size_t n_missed = 0, n_emc_hit = 0;
struct dfc_cache *cache = &pmd->flow_cache;
struct dp_packet *packet;
const size_t cnt = dp_packet_batch_size(packets_);
uint32_t cur_min = pmd->ctx.emc_insert_min;
int i;
uint16_t tcp_flags;
bool smc_enable_db;
size_t map_cnt = 0;
bool batch_enable = true;
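/* 'map_cnt' counts the entries added to 'flow_map'. 'batch_enable'
* stays true until the first cache miss in the batch; after that,
* matching packets are queued to 'flow_map' instead of being
* flow-batched directly, so the original packet order is preserved. */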
atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
pmd_perf_update_counter(&pmd->perf_stats,
md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
cnt);
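/* First pass over the batch: packets that match a flow mark or hit the
* EMC are queued for their flow right away (or added to 'flow_map' once
* batching has been disabled), while misses are regrouped at the front
* of 'packets_' and their keys collected for the later SMC and megaflow
* lookups. */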
DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
struct dp_netdev_flow *flow;
uint32_t mark;
if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
dp_packet_delete(packet);
continue;
}
if (i != cnt - 1) {
struct dp_packet **packets = packets_->packets;
/* Prefetch next packet data and metadata. */
OVS_PREFETCH(dp_packet_data(packets[i+1]));
pkt_metadata_prefetch_init(&packets[i+1]->md);
}
if (!md_is_valid) {
pkt_metadata_init(&packet->md, port_no);
}
if ((*recirc_depth_get() == 0) &&
dp_packet_has_flow_mark(packet, &mark)) {
flow = mark_to_flow_find(pmd, mark);
if (OVS_LIKELY(flow)) {
tcp_flags = parse_tcp_flags(packet);
if (OVS_LIKELY(batch_enable)) {
dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
n_batches);
} else {
/* Flow batching should be performed only after fast-path
* processing is also completed for the packets with an EMC
* miss, or else it would result in reordering of packets
* with the same datapath flow. */
packet_enqueue_to_flow_map(packet, flow, tcp_flags,
flow_map, map_cnt++);
}
continue;
}
}
miniflow_extract(packet, &key->mf);
key->len = 0; /* Not computed yet. */
/* If both EMC and SMC are disabled, skip the hash computation. */
if (smc_enable_db == true || cur_min != 0) {
if (!md_is_valid) {
key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet,
&key->mf);
} else {
key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
}
}
if (cur_min) {
flow = emc_lookup(&cache->emc_cache, key);
} else {
flow = NULL;
}
if (OVS_LIKELY(flow)) {
tcp_flags = miniflow_get_tcp_flags(&key->mf);
n_emc_hit++;
if (OVS_LIKELY(batch_enable)) {
dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
n_batches);
} else {
/* Flow batching should be performed only after fast-path
* processing is also completed for the packets with an EMC
* miss, or else it would result in reordering of packets
* with the same datapath flow. */
packet_enqueue_to_flow_map(packet, flow, tcp_flags,
flow_map, map_cnt++);
}
} else {
/* Exact match cache missed. Group missed packets together at
* the beginning of the 'packets' array. */
dp_packet_batch_refill(packets_, packet, i);
/* Preserve the original packet order for flow batching. */
index_map[n_missed] = map_cnt;
flow_map[map_cnt++].flow = NULL;
/* 'keys[n_missed]' contains the key of the current packet and it
* will be passed to the SMC lookup. The next key should be extracted
* to 'keys[n_missed + 1]'.
* We also maintain a pointer array to the keys that missed both the SMC
* and the EMC, which will be returned to the caller for future
* processing. */
missed_keys[n_missed] = key;
key = &keys[++n_missed];
/* Skip batching for subsequent packets to avoid reordering. */
batch_enable = false;
}
}
/* Count of packets which are not flow batched. */
*n_flows = map_cnt;
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
if (!smc_enable_db) {
return dp_packet_batch_size(packets_);
}
/* Packets that missed the EMC do a batch lookup in the SMC, if enabled. */
smc_lookup_batch(pmd, keys, missed_keys, packets_,
n_missed, flow_map, index_map);
return dp_packet_batch_size(packets_);
}
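/* Handles a packet that missed all the caches and the megaflow
* classifier: performs an upcall to obtain the actions, executes them on
* the packet and, unless the upcall returned ENOSPC, installs the
* resulting flow and seeds the SMC and EMC with it. */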
static inline int
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
struct dp_packet *packet,
const struct netdev_flow_key *key,
struct ofpbuf *actions, struct ofpbuf *put_actions)
{
struct ofpbuf *add_actions;
struct dp_packet_batch b;
struct match match;
ovs_u128 ufid;
int error;
uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
match.tun_md.valid = false;
miniflow_expand(&key->mf, &match.flow);
ofpbuf_clear(actions);
ofpbuf_clear(put_actions);
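/* Compute the unique flow id and hand the packet to the slow path. The
* upcall fills 'actions' with the actions to execute on the packet and
* 'put_actions' with the actions for the flow to be installed, if they
* differ. */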
dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
&ufid, DPIF_UC_MISS, NULL, actions,
put_actions);
if (OVS_UNLIKELY(error && error != ENOSPC)) {
dp_packet_delete(packet);
return error;
}
/* The Netlink encoding of datapath flow keys cannot express
* wildcarding the presence of a VLAN tag. Instead, a missing VLAN
* tag is interpreted as exact match on the fact that there is no
* VLAN. Unless we refactor a lot of code that translates between
* Netlink and struct flow representations, we have to do the same
* here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
if (!match.wc.masks.vlans[0].tci) {
match.wc.masks.vlans[0].tci = htons(0xffff);
}
/* We can't allow the packet batching in the next loop to execute
* the actions. Otherwise, if there are any slow path actions,
* we'll send the packet up twice. */
dp_packet_batch_init_packet(&b, packet);
dp_netdev_execute_actions(pmd, &b, true, &match.flow,
actions->data, actions->size);
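/* Prefer the actions returned in 'put_actions' for the flow to install;
* fall back to the executed 'actions' if the upcall left it empty. */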
add_actions = put_actions->size ? put_actions : actions;
if (OVS_LIKELY(error != ENOSPC)) {
struct dp_netdev_flow *netdev_flow;
/* XXX: There's a race window where a flow covering this packet
* could have already been installed since we last did the flow
* lookup before upcall. This could be solved by moving the
* mutex lock outside the loop, but that's an awful long time
* to be locking everyone out of making flow installs. If we
* move to a per-core classifier, it would be reasonable. */
ovs_mutex_lock(&pmd->flow_mutex);
netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
if (OVS_LIKELY(!netdev_flow)) {
netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
add_actions->data,
add_actions->size);
}
ovs_mutex_unlock(&pmd->flow_mutex);
uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
smc_insert(pmd, key, hash);
emc_probabilistic_insert(pmd, key, netdev_flow);
}
if (pmd_perf_metrics_enabled(pmd)) {
/* Update upcall stats. */
cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
struct pmd_perf_stats *s = &pmd->perf_stats;
s->current.upcalls++;
s->current.upcall_cycles += cycles;
histogram_add_sample(&s->cycles_per_upcall, cycles);
}
return error;
}
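/* Megaflow (dpcls) lookup stage for the packets that missed the EMC/SMC in
 * dfc_processing().  'keys' holds the flow keys of those packets and
 * 'index_map' their original positions in the receive batch.  Misses are
 * handed to the slow path via upcalls; every matched packet is appended to
 * 'flow_map' in its original order so per-flow batching does not reorder
 * packets. */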
static inline void
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets_,
struct netdev_flow_key **keys,
struct dp_packet_flow_map *flow_map,
uint8_t *index_map,
odp_port_t in_port)
{
const size_t cnt = dp_packet_batch_size(packets_);
#if !defined(__CHECKER__) && !defined(_WIN32)
const size_t PKT_ARRAY_SIZE = cnt;
#else
/* Sparse or MSVC doesn't like variable length array. */
enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
struct dp_packet *packet;
struct dpcls *cls;
struct dpcls_rule *rules[PKT_ARRAY_SIZE];
struct dp_netdev *dp = pmd->dp;
int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
int lookup_cnt = 0, add_lookup_cnt;
bool any_miss;
for (size_t i = 0; i < cnt; i++) {
/* Key length is needed in all the cases, hash computed on demand. */
keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
}
/* Get the classifier for the in_port */
cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
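/* dpcls_lookup() classifies the whole batch in one call and returns true
 * only if every key matched; 'lookup_cnt' accumulates the number of
 * subtable lookups performed, which feeds the PMD_STAT_MASKED_LOOKUP
 * counter below. */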
if (OVS_LIKELY(cls)) {
any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
rules, cnt, &lookup_cnt);
} else {
any_miss = true;
memset(rules, 0, sizeof(rules));
}
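/* At least one packet missed in the dpcls.  Upcalls are attempted only if
 * the datapath's upcall_rwlock can be taken for reading without blocking;
 * when upcalls are disabled the lock is held for writing and the missed
 * packets are dropped in the branch below. */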
if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
struct ofpbuf actions, put_actions;
ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
struct dp_netdev_flow *netdev_flow;
if (OVS_LIKELY(rules[i])) {
continue;
}
/* It's possible that an earlier slow path execution installed
* a rule covering this flow. In this case, it's a lot cheaper
* to catch it here than execute a miss. */
netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
&add_lookup_cnt);
if (netdev_flow) {
lookup_cnt += add_lookup_cnt;
rules[i] = &netdev_flow->cr;
continue;
}
int error = handle_packet_upcall(pmd, packet, keys[i],
&actions, &put_actions);
if (OVS_UNLIKELY(error)) {
upcall_fail_cnt++;
} else {
upcall_ok_cnt++;
}
}
ofpbuf_uninit(&actions);
ofpbuf_uninit(&put_actions);
fat_rwlock_unlock(&dp->upcall_rwlock);
} else if (OVS_UNLIKELY(any_miss)) {
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
if (OVS_UNLIKELY(!rules[i])) {
dp_packet_delete(packet);
upcall_fail_cnt++;
}
}
}
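/* Second pass over the batch: for every packet that now has a matching
 * rule, promote the flow into the SMC and (probabilistically) the EMC,
 * and enqueue the packet into 'flow_map' at its original receive index. */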
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
struct dp_netdev_flow *flow;
/* Get the original order of this packet in received batch. */
int recv_idx = index_map[i];
uint16_t tcp_flags;
if (OVS_UNLIKELY(!rules[i])) {
continue;
}
flow = dp_netdev_flow_cast(rules[i]);
uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
smc_insert(pmd, keys[i], hash);
emc_probabilistic_insert(pmd, keys[i], flow);
/* Add these packets into the flow map in the same order
* as received.
*/
tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
packet_enqueue_to_flow_map(packet, flow, tcp_flags,
flow_map, recv_idx);
}
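/* Account this batch: masked (dpcls) hits, subtable lookups performed,
 * successful upcalls (misses) and failed or dropped upcalls (lost). */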
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
cnt - upcall_ok_cnt - upcall_fail_cnt);
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
lookup_cnt);
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
upcall_ok_cnt);
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
upcall_fail_cnt);
}
/* Packets enter the datapath from a port (or from recirculation) here.
*
* When 'md_is_valid' is true the metadata in 'packets' are already valid.
* When false the metadata in 'packets' need to be initialized. */
static void
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets,
bool md_is_valid, odp_port_t port_no)
{
#if !defined(__CHECKER__) && !defined(_WIN32)
const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
#else
/* Sparse or MSVC doesn't like variable length array. */
enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
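/* Per-batch scratch state: flow keys and pointers to the keys that missed
 * the EMC/SMC, per-flow output batches, and a flow map plus index map used
 * to preserve the original packet order across the EMC and dpcls stages. */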
OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
struct netdev_flow_key keys[PKT_ARRAY_SIZE];
struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
size_t n_batches;
struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
uint8_t index_map[PKT_ARRAY_SIZE];
size_t n_flows, i;
odp_port_t in_port;
n_batches = 0;
dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
flow_map, &n_flows, index_map, md_is_valid, port_no);
if (!dp_packet_batch_is_empty(packets)) {
/* Get ingress port from first packet's metadata. */
in_port = packets->packets[0]->md.in_port.odp_port;
fast_path_processing(pmd, packets, missed_keys,
flow_map, index_map, in_port);
}
/* Batch the rest of the packets, which are recorded in the flow map. */
for (i = 0; i < n_flows; i++) {
struct dp_packet_flow_map *map = &flow_map[i];
if (OVS_UNLIKELY(!map->flow)) {
continue;
}
dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
batches, &n_batches);
}
/* All the flow batches need to be reset before any call to
* packet_batch_per_flow_execute() as it could potentially trigger
* recirculation. When a packet matching flow 'j' happens to be
* recirculated, the nested call to dp_netdev_input__() could potentially
* classify the packet as matching another flow - say 'k'. It could happen
* that in the previous call to dp_netdev_input__() that same flow 'k'
* already had its own batches[k] still waiting to be served. So if its
* 'batch' member is not reset, the recirculated packet would be wrongly
* appended to batches[k] of the first call to dp_netdev_input__(). */
for (i = 0; i < n_batches; i++) {
batches[i].flow->batch = NULL;
}
for (i = 0; i < n_batches; i++) {
packet_batch_per_flow_execute(&batches[i], pmd);
}
}
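/* Entry point for packets received from a port. Packet metadata is not
* yet initialized at this point, so dp_netdev_input__() is invoked with
* 'md_is_valid' == false and the originating port number. */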
static void
dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets,
odp_port_t port_no)
{
dp_netdev_input__(pmd, packets, false, port_no);
}
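/* Re-entry point for recirculated packets. Their metadata (including the
* recirculation id) is already valid, so dp_netdev_input__() is invoked
* with 'md_is_valid' == true and port 0. */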
static void
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets)
{
dp_netdev_input__(pmd, packets, true, 0);
}
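/* Auxiliary data handed to dp_execute_cb() while actions are executed:
* the owning PMD thread and the flow being processed. */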
struct dp_netdev_execute_aux {
struct dp_netdev_pmd_thread *pmd;
const struct flow *flow;
};
static void
dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
void *aux)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
dp->dp_purge_aux = aux;
dp->dp_purge_cb = cb;
}
static void
dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
void *aux)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
dp->upcall_aux = aux;
dp->upcall_cb = cb;
}
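/* Scans the PMD thread's send port cache and releases dynamically assigned
* tx queue ids that have not been used for at least XPS_TIMEOUT, or all of
* them if 'purge' is true, so that the queues can be reassigned. */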
static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
bool purge)
{
struct tx_port *tx;
struct dp_netdev_port *port;
long long interval;
HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
if (!tx->port->dynamic_txqs) {
continue;
}
interval = pmd->ctx.now - tx->last_used;
if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
port = tx->port;
ovs_mutex_lock(&port->txq_used_mutex);
port->txq_used[tx->qid]--;
ovs_mutex_unlock(&port->txq_used_mutex);
tx->qid = -1;
}
}
}
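/* Picks the tx queue id of 'tx->port' to be used by this PMD thread. The
* previously assigned queue id is kept if it was used within the last
* XPS_TIMEOUT; otherwise the least used queue of the port is selected
* under 'txq_used_mutex' and recorded in 'tx->qid'. */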
static int
dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
struct tx_port *tx)
{
struct dp_netdev_port *port;
long long interval;
int i, min_cnt, min_qid;
interval = pmd->ctx.now - tx->last_used;
tx->last_used = pmd->ctx.now;
if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
return tx->qid;
}
port = tx->port;
ovs_mutex_lock(&port->txq_used_mutex);
if (tx->qid >= 0) {
port->txq_used[tx->qid]--;
tx->qid = -1;
}
min_cnt = -1;
min_qid = 0;
for (i = 0; i < netdev_n_txq(port->netdev); i++) {
if (port->txq_used[i] < min_cnt || min_cnt == -1) {
min_cnt = port->txq_used[i];
min_qid = i;
}
}
port->txq_used[min_qid]++;
tx->qid = min_qid;
ovs_mutex_unlock(&port->txq_used_mutex);
dpif_netdev_xps_revalidate_pmd(pmd, false);
VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
return min_qid;
}
static struct tx_port *
pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
odp_port_t port_no)
{
return tx_port_lookup(&pmd->tnl_port_cache, port_no);
}
static struct tx_port *
pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
odp_port_t port_no)
{
return tx_port_lookup(&pmd->send_port_cache, port_no);
}
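/* Pushes the tunnel header carried in 'attr' onto the packets in 'batch'
* using the tunnel port's netdev. Returns 0 on success; otherwise the
* whole batch is freed and an error is returned. */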
static int
push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
const struct nlattr *attr,
struct dp_packet_batch *batch)
{
struct tx_port *tun_port;
const struct ovs_action_push_tnl *data;
int err;
data = nl_attr_get(attr);
tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
if (!tun_port) {
err = -EINVAL;
goto error;
}
err = netdev_push_header(tun_port->port->netdev, batch, data);
if (!err) {
return 0;
}
error:
dp_packet_delete_batch(batch, true);
return err;
}
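/* Executes an OVS_ACTION_ATTR_USERSPACE action for a single packet:
* performs a DPIF_UC_ACTION upcall carrying 'userdata' and then executes
* the actions returned by the upcall handler. On other upcall errors the
* packet is dropped if 'should_steal' is true. */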
static void
dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
struct dp_packet *packet, bool should_steal,
struct flow *flow, ovs_u128 *ufid,
struct ofpbuf *actions,
const struct nlattr *userdata)
{
struct dp_packet_batch b;
int error;
ofpbuf_clear(actions);
error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
DPIF_UC_ACTION, userdata, actions,
NULL);
if (!error || error == ENOSPC) {
dp_packet_batch_init_packet(&b, packet);
dp_netdev_execute_actions(pmd, &b, should_steal, flow,
actions->data, actions->size);
} else if (should_steal) {
dp_packet_delete(packet);
}
}
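/* Per-action callback used by dp_netdev_execute_actions(): implements the
* datapath-specific actions (output, tunnel push/pop, userspace,
* recirculation, conntrack, ...) for the batch in 'packets_'. */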
static void
dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
const struct nlattr *a, bool should_steal)
OVS_NO_THREAD_SAFETY_ANALYSIS
{
struct dp_netdev_execute_aux *aux = aux_;
uint32_t *depth = recirc_depth_get();
struct dp_netdev_pmd_thread *pmd = aux->pmd;
struct dp_netdev *dp = pmd->dp;
int type = nl_attr_type(a);
struct tx_port *p;
switch ((enum ovs_action_attr)type) {
case OVS_ACTION_ATTR_OUTPUT:
p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
if (OVS_LIKELY(p)) {
struct dp_packet *packet;
struct dp_packet_batch out;
if (!should_steal) {
dp_packet_batch_clone(&out, packets_);
dp_packet_batch_reset_cutlen(packets_);
packets_ = &out;
}
dp_packet_batch_apply_cutlen(packets_);
#ifdef DPDK_NETDEV
if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
&& packets_->packets[0]->source
!= p->output_pkts.packets[0]->source)) {
/* XXX: netdev-dpdk assumes that all packets in a single
* output batch have the same source. Flush here to
* avoid memory access issues. */
dp_netdev_pmd_flush_output_on_port(pmd, p);
}
#endif
if (dp_packet_batch_size(&p->output_pkts)
+ dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
/* Flush here to avoid overflow. */
dp_netdev_pmd_flush_output_on_port(pmd, p);
}
if (dp_packet_batch_is_empty(&p->output_pkts)) {
pmd->n_output_batches++;
}
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
pmd->ctx.last_rxq;
dp_packet_batch_add(&p->output_pkts, packet);
}
return;
}
break;
case OVS_ACTION_ATTR_TUNNEL_PUSH:
if (should_steal) {
/* We're requested to push a tunnel header, but we also need to take
* ownership of these packets. Thus, we can avoid performing
* the action, because the caller will not use the result anyway.
* Just break to free the batch. */
break;
}
dp_packet_batch_apply_cutlen(packets_);
push_tnl_action(pmd, a, packets_);
return;
case OVS_ACTION_ATTR_TUNNEL_POP:
if (*depth < MAX_RECIRC_DEPTH) {
struct dp_packet_batch *orig_packets_ = packets_;
odp_port_t portno = nl_attr_get_odp_port(a);
p = pmd_tnl_port_cache_lookup(pmd, portno);
if (p) {
struct dp_packet_batch tnl_pkt;
if (!should_steal) {
dp_packet_batch_clone(&tnl_pkt, packets_);
packets_ = &tnl_pkt;
dp_packet_batch_reset_cutlen(orig_packets_);
}
dp_packet_batch_apply_cutlen(packets_);
netdev_pop_header(p->port->netdev, packets_);
if (dp_packet_batch_is_empty(packets_)) {
return;
}
struct dp_packet *packet;
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
packet->md.in_port.odp_port = portno;
}
(*depth)++;
dp_netdev_recirculate(pmd, packets_);
(*depth)--;
return;
}
}
break;
case OVS_ACTION_ATTR_USERSPACE:
if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
struct dp_packet_batch *orig_packets_ = packets_;
const struct nlattr *userdata;
struct dp_packet_batch usr_pkt;
struct ofpbuf actions;
struct flow flow;
ovs_u128 ufid;
bool clone = false;
userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
ofpbuf_init(&actions, 0);
if (packets_->trunc) {
if (!should_steal) {
dp_packet_batch_clone(&usr_pkt, packets_);
packets_ = &usr_pkt;
clone = true;
dp_packet_batch_reset_cutlen(orig_packets_);
}
dp_packet_batch_apply_cutlen(packets_);
}
struct dp_packet *packet;
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
flow_extract(packet, &flow);
dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
dp_execute_userspace_action(pmd, packet, should_steal, &flow,
&ufid, &actions, userdata);
}
if (clone) {
dp_packet_delete_batch(packets_, true);
}
ofpbuf_uninit(&actions);
fat_rwlock_unlock(&dp->upcall_rwlock);
return;
}
break;
case OVS_ACTION_ATTR_RECIRC:
if (*depth < MAX_RECIRC_DEPTH) {
struct dp_packet_batch recirc_pkts;
if (!should_steal) {
dp_packet_batch_clone(&recirc_pkts, packets_);
packets_ = &recirc_pkts;
}
struct dp_packet *packet;
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
packet->md.recirc_id = nl_attr_get_u32(a);
}
(*depth)++;
dp_netdev_recirculate(pmd, packets_);
(*depth)--;
return;
}
VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
break;
case OVS_ACTION_ATTR_CT: {
const struct nlattr *b;
bool force = false;
bool commit = false;
unsigned int left;
uint16_t zone = 0;
const char *helper = NULL;
const uint32_t *setmark = NULL;
const struct ovs_key_ct_labels *setlabel = NULL;
struct nat_action_info_t nat_action_info;
struct nat_action_info_t *nat_action_info_ref = NULL;
bool nat_config = false;
NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
nl_attr_get_size(a)) {
enum ovs_ct_attr sub_type = nl_attr_type(b);
switch (sub_type) {
case OVS_CT_ATTR_FORCE_COMMIT:
force = true;
/* fall through. */
case OVS_CT_ATTR_COMMIT:
commit = true;
break;
case OVS_CT_ATTR_ZONE:
zone = nl_attr_get_u16(b);
break;
case OVS_CT_ATTR_HELPER:
helper = nl_attr_get_string(b);
break;
case OVS_CT_ATTR_MARK:
setmark = nl_attr_get(b);
break;
case OVS_CT_ATTR_LABELS:
setlabel = nl_attr_get(b);
break;
case OVS_CT_ATTR_EVENTMASK:
/* Silently ignored, as the userspace datapath does not generate
* netlink events. */
break;
case OVS_CT_ATTR_NAT: {
const struct nlattr *b_nest;
unsigned int left_nest;
bool ip_min_specified = false;
bool proto_num_min_specified = false;
bool ip_max_specified = false;
bool proto_num_max_specified = false;
memset(&nat_action_info, 0, sizeof nat_action_info);
nat_action_info_ref = &nat_action_info;
NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
switch (sub_type_nest) {
case OVS_NAT_ATTR_SRC:
case OVS_NAT_ATTR_DST:
nat_config = true;
nat_action_info.nat_action |=
((sub_type_nest == OVS_NAT_ATTR_SRC)
? NAT_ACTION_SRC : NAT_ACTION_DST);
break;
case OVS_NAT_ATTR_IP_MIN:
memcpy(&nat_action_info.min_addr,
nl_attr_get(b_nest),
nl_attr_get_size(b_nest));
ip_min_specified = true;
break;
case OVS_NAT_ATTR_IP_MAX:
memcpy(&nat_action_info.max_addr,
nl_attr_get(b_nest),
nl_attr_get_size(b_nest));
ip_max_specified = true;
break;
case OVS_NAT_ATTR_PROTO_MIN:
nat_action_info.min_port =
nl_attr_get_u16(b_nest);
proto_num_min_specified = true;
break;
case OVS_NAT_ATTR_PROTO_MAX:
nat_action_info.max_port =
nl_attr_get_u16(b_nest);
proto_num_max_specified = true;
break;
case OVS_NAT_ATTR_PERSISTENT:
case OVS_NAT_ATTR_PROTO_HASH:
case OVS_NAT_ATTR_PROTO_RANDOM:
break;
case OVS_NAT_ATTR_UNSPEC:
case __OVS_NAT_ATTR_MAX:
OVS_NOT_REACHED();
}
}
if (ip_min_specified && !ip_max_specified) {
nat_action_info.max_addr = nat_action_info.min_addr;
}
if (proto_num_min_specified && !proto_num_max_specified) {
nat_action_info.max_port = nat_action_info.min_port;
}
if (proto_num_min_specified || proto_num_max_specified) {
if (nat_action_info.nat_action & NAT_ACTION_SRC) {
nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
} else if (nat_action_info.nat_action & NAT_ACTION_DST) {
nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
}
}
break;
}
case OVS_CT_ATTR_UNSPEC:
case __OVS_CT_ATTR_MAX:
OVS_NOT_REACHED();
}
}
/* We won't be able to function properly in this case, hence
* complain loudly. */
if (nat_config && !commit) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
VLOG_WARN_RL(&rl, "NAT specified without commit.");
}
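/* Hand the whole batch to the connection tracker.  The PMD context clock
 * is kept in microseconds while conntrack_execute() expects milliseconds,
 * hence the division below. */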
conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
commit, zone, setmark, setlabel, aux->flow->tp_src,
aux->flow->tp_dst, helper, nat_action_info_ref,
pmd->ctx.now / 1000);
break;
}
case OVS_ACTION_ATTR_METER:
dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
pmd->ctx.now);
break;
case OVS_ACTION_ATTR_PUSH_VLAN:
case OVS_ACTION_ATTR_POP_VLAN:
case OVS_ACTION_ATTR_PUSH_MPLS:
case OVS_ACTION_ATTR_POP_MPLS:
case OVS_ACTION_ATTR_SET:
case OVS_ACTION_ATTR_SET_MASKED:
case OVS_ACTION_ATTR_SAMPLE:
case OVS_ACTION_ATTR_HASH:
case OVS_ACTION_ATTR_UNSPEC:
case OVS_ACTION_ATTR_TRUNC:
case OVS_ACTION_ATTR_PUSH_ETH:
case OVS_ACTION_ATTR_POP_ETH:
case OVS_ACTION_ATTR_CLONE:
case OVS_ACTION_ATTR_PUSH_NSH:
case OVS_ACTION_ATTR_POP_NSH:
case OVS_ACTION_ATTR_CT_CLEAR:
case __OVS_ACTION_ATTR_MAX:
OVS_NOT_REACHED();
}
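/* Any action that did not return above falls through to here: the batch is
 * freed if we own it ('should_steal'), otherwise it is left for the
 * caller. */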
dp_packet_delete_batch(packets_, should_steal);
}
static void
dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets,
bool should_steal, const struct flow *flow,
const struct nlattr *actions, size_t actions_len)
{
struct dp_netdev_execute_aux aux = { pmd, flow };
odp_execute_actions(&aux, packets, should_steal, actions,
actions_len, dp_execute_cb);
}
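/* Illustrative sketch (not taken from the original sources): a caller that
 * already has a netlink-encoded action list for a flow can run it over a
 * packet batch roughly as follows, assuming 'pmd', 'batch', 'flow',
 * 'actions' and 'actions_len' have been set up:
 *
 *     dp_netdev_execute_actions(pmd, &batch, true, &flow,
 *                               actions, actions_len);
 *
 * With 'should_steal' true the callee takes ownership of the packets, so
 * the caller must not touch them afterwards. */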
struct dp_netdev_ct_dump {
struct ct_dpif_dump_state up;
struct conntrack_dump dump;
struct conntrack *ct;
struct dp_netdev *dp;
};
static int
dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
const uint16_t *pzone, int *ptot_bkts)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_ct_dump *dump;
dump = xzalloc(sizeof *dump);
dump->dp = dp;
dump->ct = &dp->conntrack;
conntrack_dump_start(&dp->conntrack, &dump->dump, pzone, ptot_bkts);
*dump_ = &dump->up;
return 0;
}
static int
dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
struct ct_dpif_dump_state *dump_,
struct ct_dpif_entry *entry)
{
struct dp_netdev_ct_dump *dump;
INIT_CONTAINER(dump, dump_, up);
return conntrack_dump_next(&dump->dump, entry);
}
static int
dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
struct ct_dpif_dump_state *dump_)
{
struct dp_netdev_ct_dump *dump;
int err;
INIT_CONTAINER(dump, dump_, up);
err = conntrack_dump_done(&dump->dump);
free(dump);
return err;
}
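/* Illustrative sketch (not taken from the original sources) of how the
 * three dump callbacks above chain together; in practice they are reached
 * through the generic ct-dpif layer rather than called directly:
 *
 *     struct ct_dpif_dump_state *dump;
 *     struct ct_dpif_entry entry;
 *     int tot_bkts;
 *
 *     dpif_netdev_ct_dump_start(dpif, &dump, NULL, &tot_bkts);
 *     while (!dpif_netdev_ct_dump_next(dpif, dump, &entry)) {
 *         ...consume 'entry'...
 *     }
 *     dpif_netdev_ct_dump_done(dpif, dump);
 */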
static int
dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
const struct ct_dpif_tuple *tuple)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
if (tuple) {
return conntrack_flush_tuple(&dp->conntrack, tuple, zone ? *zone : 0);
}
return conntrack_flush(&dp->conntrack, zone);
}
static int
dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
return conntrack_set_maxconns(&dp->conntrack, maxconns);
}
static int
dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
return conntrack_get_maxconns(&dp->conntrack, maxconns);
}
static int
dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
return conntrack_get_nconns(&dp->conntrack, nconns);
}
const struct dpif_class dpif_netdev_class = {
"netdev",
dpif_netdev_init,
dpif_netdev_enumerate,
dpif_netdev_port_open_type,
dpif_netdev_open,
dpif_netdev_close,
dpif_netdev_destroy,
dpif_netdev_run,
dpif_netdev_wait,
dpif_netdev_get_stats,
dpif_netdev_port_add,
dpif_netdev_port_del,
dpif_netdev_port_set_config,
dpif_netdev_port_query_by_number,
dpif_netdev_port_query_by_name,
NULL, /* port_get_pid */
dpif_netdev_port_dump_start,
dpif_netdev_port_dump_next,
dpif_netdev_port_dump_done,
dpif_netdev_port_poll,
dpif_netdev_port_poll_wait,
dpif_netdev_flow_flush,
dpif_netdev_flow_dump_create,
dpif_netdev_flow_dump_destroy,
dpif_netdev_flow_dump_thread_create,
dpif_netdev_flow_dump_thread_destroy,
dpif_netdev_flow_dump_next,
dpif_netdev_operate,
NULL, /* recv_set */
NULL, /* handlers_set */
dpif_netdev_set_config,
dpif_netdev_queue_to_priority,
NULL, /* recv */
NULL, /* recv_wait */
NULL, /* recv_purge */
dpif_netdev_register_dp_purge_cb,
dpif_netdev_register_upcall_cb,
dpif_netdev_enable_upcall,
dpif_netdev_disable_upcall,
dpif_netdev_get_datapath_version,
dpif_netdev_ct_dump_start,
dpif_netdev_ct_dump_next,
dpif_netdev_ct_dump_done,
dpif_netdev_ct_flush,
dpif_netdev_ct_set_maxconns,
dpif_netdev_ct_get_maxconns,
dpif_netdev_ct_get_nconns,
NULL, /* ct_set_limits */
NULL, /* ct_get_limits */
NULL, /* ct_del_limits */
dpif_netdev_meter_get_features,
dpif_netdev_meter_set,
dpif_netdev_meter_get,
dpif_netdev_meter_del,
};
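/* The NULL slots above are optional callbacks that the userspace datapath
 * does not implement; the generic dpif layer checks for them before
 * calling.  The dummy classes registered below are copies of this class
 * with only the type name changed. */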
static void
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
const char *argv[], void *aux OVS_UNUSED)
{
struct dp_netdev_port *port;
struct dp_netdev *dp;
odp_port_t port_no;
ovs_mutex_lock(&dp_netdev_mutex);
dp = shash_find_data(&dp_netdevs, argv[1]);
if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
ovs_mutex_unlock(&dp_netdev_mutex);
unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
return;
}
ovs_refcount_ref(&dp->ref_cnt);
ovs_mutex_unlock(&dp_netdev_mutex);
ovs_mutex_lock(&dp->port_mutex);
if (get_port_by_name(dp, argv[2], &port)) {
unixctl_command_reply_error(conn, "unknown port");
goto exit;
}
port_no = u32_to_odp(atoi(argv[3]));
if (!port_no || port_no == ODPP_NONE) {
unixctl_command_reply_error(conn, "bad port number");
goto exit;
}
if (dp_netdev_lookup_port(dp, port_no)) {
unixctl_command_reply_error(conn, "port number already in use");
goto exit;
}
/* Remove port. */
hmap_remove(&dp->ports, &port->node);
reconfigure_datapath(dp);
/* Reinsert with new port number. */
port->port_no = port_no;
hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
reconfigure_datapath(dp);
seq_change(dp->port_seq);
unixctl_command_reply(conn, NULL);
exit:
ovs_mutex_unlock(&dp->port_mutex);
dp_netdev_unref(dp);
}
static void
dpif_dummy_register__(const char *type)
{
struct dpif_class *class;
class = xmalloc(sizeof *class);
*class = dpif_netdev_class;
class->type = xstrdup(type);
dp_register_provider(class);
}
static void
dpif_dummy_override(const char *type)
{
int error;
/*
* Ignore EAFNOSUPPORT to allow --enable-dummy=system with
* a userland-only build. It's useful for the testsuite.
*/
error = dp_unregister_provider(type);
if (error == 0 || error == EAFNOSUPPORT) {
dpif_dummy_register__(type);
}
}
void
dpif_dummy_register(enum dummy_level level)
{
if (level == DUMMY_OVERRIDE_ALL) {
struct sset types;
const char *type;
sset_init(&types);
dp_enumerate_types(&types);
SSET_FOR_EACH (type, &types) {
dpif_dummy_override(type);
}
sset_destroy(&types);
} else if (level == DUMMY_OVERRIDE_SYSTEM) {
dpif_dummy_override("system");
}
dpif_dummy_register__("dummy");
unixctl_command_register("dpif-dummy/change-port-number",
"dp port new-number",
3, 3, dpif_dummy_change_port_number, NULL);
}
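/* Example invocation (illustrative; the dummy datapath is only registered
 * when ovs-vswitchd runs with --enable-dummy), renumbering port "p1" of
 * datapath "ovs-dummy" to 99:
 *
 *     ovs-appctl dpif-dummy/change-port-number ovs-dummy p1 99
 */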
/* Datapath Classifier. */
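/* One dpcls instance is kept per (PMD thread, input port) pair.  Within an
 * instance the subtables are held both in a cmap (for insertion and removal
 * by mask) and in a priority vector that is periodically re-sorted by hit
 * count, so that lookups probe the most frequently hit subtable first. */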
/* A set of rules that all have the same fields wildcarded. */
struct dpcls_subtable {
/* The fields are only used by writers. */
struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
/* These fields are accessed by readers. */
struct cmap rules; /* Contains "struct dpcls_rule"s. */
uint32_t hit_cnt; /* Number of match hits in subtable in current
optimization interval. */
struct netdev_flow_key mask; /* Wildcards for fields (const). */
/* 'mask' must be the last field, additional space is allocated here. */
};
/* Initializes 'cls' as a classifier that initially contains no classification
* rules. */
static void
dpcls_init(struct dpcls *cls)
{
cmap_init(&cls->subtables_map);
pvector_init(&cls->subtables);
}
static void
dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
{
VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
pvector_remove(&cls->subtables, subtable);
cmap_remove(&cls->subtables_map, &subtable->cmap_node,
subtable->mask.hash);
cmap_destroy(&subtable->rules);
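/* Defer the actual free until all RCU readers that may still be probing
 * this subtable have quiesced. */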
ovsrcu_postpone(free, subtable);
}
/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
* caller's responsibility.
* May only be called after all the readers have been terminated. */
static void
dpcls_destroy(struct dpcls *cls)
{
if (cls) {
struct dpcls_subtable *subtable;
CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
ovs_assert(cmap_count(&subtable->rules) == 0);
dpcls_destroy_subtable(cls, subtable);
}
cmap_destroy(&cls->subtables_map);
pvector_destroy(&cls->subtables);
}
}
static struct dpcls_subtable *
dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
{
struct dpcls_subtable *subtable;
/* Need to add one. */
subtable = xmalloc(sizeof *subtable
- sizeof subtable->mask.mf + mask->len);
cmap_init(&subtable->rules);
subtable->hit_cnt = 0;
netdev_flow_key_clone(&subtable->mask, mask);
cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
/* Add the new subtable at the end of the pvector (with no hits yet). */
pvector_insert(&cls->subtables, subtable, 0);
VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
cmap_count(&cls->subtables_map), subtable, cls->in_port);
pvector_publish(&cls->subtables);
return subtable;
}
static inline struct dpcls_subtable *
dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
{
struct dpcls_subtable *subtable;
CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
&cls->subtables_map) {
if (netdev_flow_key_equal(&subtable->mask, mask)) {
return subtable;
}
}
return dpcls_create_subtable(cls, mask);
}
/* Periodically sort the dpcls subtable vectors according to hit counts */
static void
dpcls_sort_subtable_vector(struct dpcls *cls)
{
struct pvector *pvec = &cls->subtables;
struct dpcls_subtable *subtable;
PVECTOR_FOR_EACH (subtable, pvec) {
pvector_change_priority(pvec, subtable, subtable->hit_cnt);
subtable->hit_cnt = 0;
}
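/* Publish the updated priorities.  pvector iteration visits entries in
 * decreasing priority order, so subsequent lookups will probe the most
 * frequently hit subtable first; pvector_publish() makes the new ordering
 * visible to concurrent readers in a single RCU-protected step. */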
pvector_publish(pvec);
}
static inline void
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
struct polled_queue *poll_list, int poll_cnt)
{
struct dpcls *cls;
    uint64_t tot_idle = 0, tot_proc = 0;
    unsigned int pmd_load = 0;
    if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
        uint64_t curr_tsc;
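        /* When PMD auto load balancing is enabled, estimate this PMD's load
         * over the last interval from its idle and busy iteration cycle
         * counters and count consecutive overloaded intervals. */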
        struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
        if (pmd_alb->is_enabled && !pmd->isolated
            && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
                pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
            && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
        {
            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];

            if (tot_proc) {
                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
            }

            if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
                atomic_count_inc(&pmd->pmd_overloaded);
            } else {
                atomic_count_set(&pmd->pmd_overloaded, 0);
            }
        }
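        /* Snapshot the cycle counters so that the next interval is measured
         * against this point rather than against cumulative totals. */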
        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];

        /* Get the cycles that were used to process each queue and store. */
        for (unsigned i = 0; i < poll_cnt; i++) {
            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
                                                        RXQ_CYCLES_PROC_CURR);
            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
                                     0);
        }

        curr_tsc = cycles_counter_update(&pmd->perf_stats);
        if (pmd->intrvl_tsc_prev) {
            /* There is a prev timestamp, store a new intrvl cycle count. */
            atomic_store_relaxed(&pmd->intrvl_cycles,
                                 curr_tsc - pmd->intrvl_tsc_prev);
        }
        pmd->intrvl_tsc_prev = curr_tsc;

        /* Start new measuring interval */
        pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
    }
    if (pmd->ctx.now > pmd->next_optimization) {
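        /* Re-sort each per-in_port classifier's subtable vector by hit count
         * so that the most frequently hit subtables are searched first. */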
        /* Try to obtain the flow lock to block out revalidator threads.
         * If not possible, just try next time. */
        if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
            /* Optimize each classifier */
            CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
                dpcls_sort_subtable_vector(cls);
            }
            ovs_mutex_unlock(&pmd->flow_mutex);
            /* Start new measuring interval */
            pmd->next_optimization = pmd->ctx.now
                                     + DPCLS_OPTIMIZATION_INTERVAL;
        }
    }
}
/* Insert 'rule' into 'cls'. */
static void
dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
             const struct netdev_flow_key *mask)
{
    struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
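    /* dpcls_find_subtable() returns the subtable matching 'mask', creating it
     * first if no rule with this mask has been inserted yet. */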
    /* Refer to subtable's mask, also for later removal. */
    rule->mask = &subtable->mask;
    cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
}
/* Removes 'rule' from 'cls', also destructing the 'rule'. */
static void
dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
{
    struct dpcls_subtable *subtable;

    ovs_assert(rule->mask);
    /* Get subtable from reference in rule->mask. */
    INIT_CONTAINER(subtable, rule->mask, mask);
    if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
        == 0) {
        /* Delete empty subtable. */
        dpcls_destroy_subtable(cls, subtable);
        pvector_publish(&cls->subtables);
    }
}
/* Returns true if 'target' satisfies 'key' in 'mask', that is, if for each
 * 1-bit in 'mask' the values in 'key' and 'target' are the same. */
static bool
dpcls_rule_matches_key(const struct dpcls_rule *rule,
                       const struct netdev_flow_key *target)
{
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
    uint64_t value;
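    /* The rule's key was built with the subtable's mask, so 'rule->flow' and
     * 'rule->mask' cover the same set of fields; 'keyp' and 'maskp' therefore
     * advance in lockstep while iterating over 'target'. */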
    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
            return false;
        }
    }
    return true;
}
/* For each miniflow in 'keys' performs a classifier lookup writing the result
 * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
 * NULL it is skipped.
 *
 * This function is optimized for use in the userspace datapath and therefore
 * does not implement a lot of features available in the standard
 * classifier_lookup() function.  Specifically, it does not implement
 * priorities, instead returning any rule which matches the flow.
 *
 * Returns true if all miniflows found a corresponding rule. */
static bool
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
             struct dpcls_rule **rules, const size_t cnt,
             int *num_lookups_p)
{
    /* The received 'cnt' miniflows are the search-keys that will be processed
     * to find a matching entry into the available subtables.
     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
    typedef uint32_t map_type;
#define MAP_BITS (sizeof(map_type) * CHAR_BIT)
    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);

    struct dpcls_subtable *subtable;

    map_type keys_map = TYPE_MAXIMUM(map_type); /* Set all bits. */
    map_type found_map;
    uint32_t hashes[MAP_BITS];
    const struct cmap_node *nodes[MAP_BITS];

    if (cnt != MAP_BITS) {
        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
    }
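    /* Each bit set in 'keys_map' marks a search-key that still needs a match;
     * bits are cleared below as keys find a matching rule in a subtable. */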
    memset(rules, 0, cnt * sizeof *rules);
    int lookups_match = 0, subtable_pos = 1;
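    /* 'lookups_match' accumulates, for every key that hits, the 1-based
     * position of the subtable it hit in, which lets the caller report the
     * average number of subtable lookups per hit. */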
    /* The Datapath classifier - aka dpcls - is composed of subtables.
     * Subtables are dynamically created as needed when new rules are
     * inserted.  Each subtable collects rules with matches on a specific
     * subset of packet fields as defined by the subtable's mask.  We proceed
     * to process every search-key against each subtable, but when a match is
     * found for a search-key, the search for that key can stop because the
     * rules are non-overlapping. */
    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
        int i;

        /* Compute hashes for the remaining keys.  Each search-key is
         * masked with the subtable's mask to avoid hashing the wildcarded
         * bits. */
        ULLONG_FOR_EACH_1(i, keys_map) {
            hashes[i] = netdev_flow_key_hash_in_mask(keys[i],
                                                     &subtable->mask);
        }
        /* Lookup. */
        found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
        /* Check results.  When the i-th bit of found_map is set, it means
         * that a set of nodes with a matching hash value was found for the
         * i-th search-key.  Due to possible hash collisions we need to check
         * which of the found rules, if any, really matches our masked
         * search-key. */
        ULLONG_FOR_EACH_1(i, found_map) {
            struct dpcls_rule *rule;

            CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
                if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
                    rules[i] = rule;
                    /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
                     * within one second optimization interval. */
                    subtable->hit_cnt++;
                    lookups_match += subtable_pos;
                    goto next;
                }
            }
            /* None of the found rules was a match.  Reset the i-th bit to
             * keep searching this key in the next subtable. */
            ULLONG_SET0(found_map, i);  /* Did not match. */
        next:
            ;                           /* Keep Sparse happy. */
        }
        keys_map &= ~found_map;         /* Clear the found rules. */
        if (!keys_map) {
            if (num_lookups_p) {
                *num_lookups_p = lookups_match;
            }
            return true;                /* All found. */
        }
        subtable_pos++;
    }

    if (num_lookups_p) {
        *num_lookups_p = lookups_match;
    }
    return false;                       /* Some misses. */
}