2
0
mirror of https://github.com/openvswitch/ovs synced 2025-10-29 15:28:56 +00:00

util: New function popcount().

This is the fastest portable implementation among the ones below, as
measured with GCC 4.4 on a Xeon X3430.  The measeured times were, in
seconds:

popcount1    25.6
popcount2     6.9 (but is not portable)
popcount3    31.4
popcount4    25.6
popcount5    61.6 (and is buggy)
popcount6    64.6
popcount7    32.3
popcount8    11.2

int
popcount1(unsigned int x)
{
    return __builtin_popcount(x);
}

int
popcount2(unsigned int x)
{
    unsigned int y;
    asm("popcnt %1, %0" : "=r" (y) : "g" (x));
    return y;
}

int
popcount3(unsigned int x)
{
    unsigned int n;

    n = (x >> 1) & 033333333333;
    x -= n;
    n = (n >> 1) & 033333333333;
    x -= n;
    x = (x + (x >> 3)) & 030707070707;
    return x % 63;
}

int
popcount4(unsigned int x)
{
    x -= (x >> 1) & 0x55555555;
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x + (x >> 4)) & 0x0f0f0f0f;
    x += x >> 8;
    x += x >> 16;
    return x & 0x3f;
}

int
popcount5(unsigned int x)
{
    int n;

    n = 0;
    while (x) {
        if (x & 0xf) {
            n += ((0xe9949440 >> (x & 0xf)) & 3) + 1;
        }
        x >>= 4;
    }
    return n;
}

int
popcount6(unsigned int x)
{
    int n;

    n = 0;
    while (x) {
        n += (0xe994 >> (x & 7)) & 3;
        x >>= 3;
    }
    return n;
}

int
popcount7(unsigned int x)
{
    static const int table[16] = {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
    };

    return (table[x & 0xf]
            + table[(x >> 4) & 0xf]
            + table[(x >> 8) & 0xf]
            + table[(x >> 12) & 0xf]
            + table[(x >> 16) & 0xf]
            + table[(x >> 20) & 0xf]
            + table[(x >> 24) & 0xf]
            + table[x >> 28]);
}

static int
popcount8(unsigned int x)
{
    ((((X) & (1 << 0)) != 0) +                  \
     (((X) & (1 << 1)) != 0) +                  \
     (((X) & (1 << 2)) != 0) +                  \
     (((X) & (1 << 3)) != 0) +                  \
     (((X) & (1 << 4)) != 0) +                  \
     (((X) & (1 << 5)) != 0) +                  \
     (((X) & (1 << 6)) != 0) +                  \
     (((X) & (1 << 7)) != 0))

    static const uint8_t popcount8[256] = {
        INIT64(0), INIT64(64), INIT64(128), INIT64(192)
    };

    return (popcount8[x & 0xff] +
            popcount8[(x >> 8) & 0xff] +
            popcount8[(x >> 16) & 0xff] +
            popcount8[x >> 24]);
}

int
main(void)
{
    unsigned long long int x;
    int n;

    n = 0;
    for (x = 0; x <= UINT32_MAX; x++) {
        n += popcount8(x);
    }
    printf("%d\n", n);

    return 0;
}

Signed-off-by: Ben Pfaff <blp@nicira.com>
This commit is contained in:
Ben Pfaff
2012-07-20 12:38:59 -07:00
parent 8472a3cecc
commit a656cb7736
4 changed files with 95 additions and 0 deletions

View File

@@ -823,6 +823,40 @@ ctz(uint32_t n)
}
}
/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */
int
popcount(uint32_t x)
{
/* In my testing, this implementation is over twice as fast as any other
* portable implementation that I tried, including GCC 4.4
* __builtin_popcount(), although nonportable asm("popcnt") was over 50%
* faster. */
#define INIT1(X) \
((((X) & (1 << 0)) != 0) + \
(((X) & (1 << 1)) != 0) + \
(((X) & (1 << 2)) != 0) + \
(((X) & (1 << 3)) != 0) + \
(((X) & (1 << 4)) != 0) + \
(((X) & (1 << 5)) != 0) + \
(((X) & (1 << 6)) != 0) + \
(((X) & (1 << 7)) != 0))
#define INIT2(X) INIT1(X), INIT1((X) + 1)
#define INIT4(X) INIT2(X), INIT2((X) + 2)
#define INIT8(X) INIT4(X), INIT4((X) + 4)
#define INIT16(X) INIT8(X), INIT8((X) + 8)
#define INIT32(X) INIT16(X), INIT16((X) + 16)
#define INIT64(X) INIT32(X), INIT32((X) + 32)
static const uint8_t popcount8[256] = {
INIT64(0), INIT64(64), INIT64(128), INIT64(192)
};
return (popcount8[x & 0xff] +
popcount8[(x >> 8) & 0xff] +
popcount8[(x >> 16) & 0xff] +
popcount8[x >> 24]);
}
/* Returns true if the 'n' bytes starting at 'p' are zeros. */
bool
is_all_zeros(const uint8_t *p, size_t n)