ovs/lib/unicode.c

/*
 * Copyright (c) 2009, 2010 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>

#include "unicode.h"

#include <inttypes.h>

#include "openvswitch/dynamic-string.h"
#include "util.h"

/* Combines the UTF-16 leading surrogate 'leading' and trailing surrogate
 * 'trailing' into the Unicode code point that the pair encodes, and returns
 * it.
 *
 * No range checking is done: if either argument is not a surrogate of the
 * proper kind, the result is meaningless. */
int
utf16_decode_surrogate_pair(int leading, int trailing)
{
    /* Bit layout:
     *
     *    leading:          110110wwwwxxxxxx
     *   trailing:          110111xxxxxxxxxx
     *     result:  000uuuuuxxxxxxxxxxxxxxxx      where uuuuu = wwww + 1
     */
    int plane = ((leading >> 6) & 0xf) + 1;
    int upper_bits = leading & 0x3f;      /* The 6 'x' bits of 'leading'. */
    int lower_bits = trailing & 0x3ff;    /* The 10 'x' bits of 'trailing'. */

    return (plane << 16) | (upper_bits << 10) | lower_bits;
}

/* Counts the Unicode characters in the null-terminated UTF-8 string 's_' and
 * returns the count.
 *
 * The string is not validated: every byte that is not a continuation byte is
 * counted as the start of one character. */
size_t
utf8_length(const char *s_)
{
    const uint8_t *p = (const uint8_t *) s_;
    size_t n_chars = 0;

    while (*p != '\0') {
        /* Continuation bytes have the form 2#10xxxxxx.  Any other byte
         * (top bits 2#00, 2#01, or 2#11) begins a new character. */
        if ((*p & 0xc0) != 0x80) {
            n_chars++;
        }
        p++;
    }
    return n_chars;
}

static char *
invalid_utf8_sequence(const uint8_t *s, int n, size_t *lengthp)
{
    struct ds msg;
    int i;

    if (lengthp) {
        *lengthp = 0;
    }

    ds_init(&msg);
    ds_put_cstr(&msg, "invalid UTF-8 sequence");
    for (i = 0; i < n; i++) {
        ds_put_format(&msg, " 0x%02"PRIx8, s[i]);
    }
    return ds_steal_cstr(&msg);
}

/* One family of well-formed UTF-8 byte sequences.
 *
 * octets[i] is a pair { min, max } that gives the inclusive range of values
 * accepted for the byte at position i of the sequence; octets[0] describes
 * the lead byte.  A pair whose first element is 0 ends the sequence:
 * utf8_validate() stops checking bytes when it reaches one.  The table in
 * lookup_utf8_sequence() fills in at most four ranges per entry, so the fifth
 * pair is always that terminator. */
struct utf8_sequence {
    uint8_t octets[5][2];
};

static const struct utf8_sequence *
lookup_utf8_sequence(uint8_t c)
{
    static const struct utf8_sequence seqs[] = {
        { { { 0x01, 0x7f },
            { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } } },

        { { { 0xc2, 0xdf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 }, { 0, 0 } } },

        { { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf },
            {0,0}, {0, 0 } } },

        { { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 } } },

        { { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 } } },

        { { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 } } },
    };

    size_t i;

    for (i = 0; i < ARRAY_SIZE(seqs); i++) {
        const uint8_t *o = seqs[i].octets[0];
        if (c >= o[0] && c <= o[1]) {
            return &seqs[i];
        }
    }
    return NULL;
}

/* Verifies that the null-terminated string 's_' is well-formed UTF-8.
 *
 * On success, returns NULL and stores the number of Unicode characters in
 * 's_' into '*lengthp'.  On failure, returns an error message, which the
 * caller must free, and stores 0 into '*lengthp'.
 *
 * 'lengthp' may be NULL if the caller does not need the length. */
char *
utf8_validate(const char *s_, size_t *lengthp)
{
    const uint8_t *p = (const uint8_t *) s_;
    size_t n_chars = 0;

    while (*p != '\0') {
        const struct utf8_sequence *seq;
        int pos;

        n_chars++;

        /* A single-byte (ASCII) character needs no table lookup. */
        if (p[0] < 0x80) {
            p++;
            continue;
        }

        seq = lookup_utf8_sequence(p[0]);
        if (!seq) {
            return invalid_utf8_sequence(p, 1, lengthp);
        }

        /* Check each following byte against its permitted range.  A string
         * that ends early is caught here: the terminating null byte lies
         * below every range's minimum, so nothing past it is ever read. */
        for (pos = 1; seq->octets[pos][0] != 0; pos++) {
            uint8_t min = seq->octets[pos][0];
            uint8_t max = seq->octets[pos][1];

            if (p[pos] < min || p[pos] > max) {
                return invalid_utf8_sequence(p, pos + 1, lengthp);
            }
        }
        p += pos;
    }

    if (lengthp != NULL) {
        *lengthp = n_chars;
    }
    return NULL;
}
Implement JSON parsing and serialization. This will be used by the upcoming Open vSwitch configuration database. 2009-11-04 14:55:53 -08:00			`/*`
Global replace of Nicira Networks. Replaced all instances of Nicira Networks(, Inc) to Nicira, Inc. Feature #10593 Signed-off-by: Raju Subramanian <rsubramanian@nicira.com> Signed-off-by: Ben Pfaff <blp@nicira.com> 2012-05-02 15:21:36 -07:00			`* Copyright (c) 2009, 2010 Nicira, Inc.`
Implement JSON parsing and serialization. This will be used by the upcoming Open vSwitch configuration database. 2009-11-04 14:55:53 -08:00			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at:`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`

			`#include <config.h>`

			`#include "unicode.h"`

ovsdb: Add simple constraints. 2010-02-08 14:09:36 -08:00			`#include <inttypes.h>`

Move lib/dynamic-string.h to include/openvswitch directory Signed-off-by: Ben Warren <ben@skyportsystems.com> Signed-off-by: Ben Pfaff <blp@ovn.org> 2016-03-03 10:20:46 -08:00			`#include "openvswitch/dynamic-string.h"`
ovsdb: Add simple constraints. 2010-02-08 14:09:36 -08:00			`#include "util.h"`

Implement JSON parsing and serialization. This will be used by the upcoming Open vSwitch configuration database. 2009-11-04 14:55:53 -08:00			`/* Returns the unicode code point corresponding to leading surrogate 'leading'`
			`* and trailing surrogate 'trailing'. The return value will not make any`
			`* sense if 'leading' or 'trailing' are not in the correct ranges for leading`
			`* or trailing surrogates. */`
			`int`
			`utf16_decode_surrogate_pair(int leading, int trailing)`
			`{`
			`/*`
			`* Leading surrogate: 110110wwwwxxxxxx`
			`* Trailing surrogate: 110111xxxxxxxxxx`
			`* Code point: 000uuuuuxxxxxxxxxxxxxxxx`
			`*/`
			`int w = (leading >> 6) & 0xf;`
			`int u = w + 1;`
			`int x0 = leading & 0x3f;`
			`int x1 = trailing & 0x3ff;`
			`return (u << 16) \| (x0 << 10) \| x1;`
			`}`
ovsdb: Add simple constraints. 2010-02-08 14:09:36 -08:00
			`/* Returns the number of Unicode characters in UTF-8 string 's'. */`
			`size_t`
			`utf8_length(const char *s_)`
			`{`
			`const uint8_t *s;`
			`size_t length;`

			`length = 0;`
			`for (s = (const uint8_t ) s_; s != '\0'; s++) {`
			`/* The most-significant bits of the first byte in a character are one`
			`* of 2#01, 2#00, or 2#11. 2#10 is a continuation byte. */`
			`length += (*s & 0xc0) != 0x80;`
			`}`
			`return length;`
			`}`

			`static char *`
			`invalid_utf8_sequence(const uint8_t s, int n, size_t lengthp)`
			`{`
			`struct ds msg;`
			`int i;`

			`if (lengthp) {`
			`*lengthp = 0;`
			`}`

			`ds_init(&msg);`
			`ds_put_cstr(&msg, "invalid UTF-8 sequence");`
			`for (i = 0; i < n; i++) {`
			`ds_put_format(&msg, " 0x%02"PRIx8, s[i]);`
			`}`
			`return ds_steal_cstr(&msg);`
			`}`

			`struct utf8_sequence {`
			`uint8_t octets[5][2];`
			`};`

			`static const struct utf8_sequence *`
			`lookup_utf8_sequence(uint8_t c)`
			`{`
			`static const struct utf8_sequence seqs[] = {`
			`{ { { 0x01, 0x7f },`
			`{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } } },`

			`{ { { 0xc2, 0xdf }, { 0x80, 0xbf },`
			`{ 0, 0 }, { 0, 0 }, { 0, 0 } } },`

			`{ { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf },`
			`{0,0}, {0, 0 } } },`

			`{ { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf },`
			`{ 0, 0 }, { 0, 0 } } },`

			`{ { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf },`
			`{ 0, 0 }, { 0, 0 } } },`

			`{ { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf },`
			`{ 0, 0 }, { 0, 0 } } },`

			`{ { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },`
			`{ 0, 0 } } },`

			`{ { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },`
			`{ 0, 0 } } },`

			`{ { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf },`
			`{ 0, 0 } } },`
			`};`

			`size_t i;`

			`for (i = 0; i < ARRAY_SIZE(seqs); i++) {`
			`const uint8_t *o = seqs[i].octets[0];`
			`if (c >= o[0] && c <= o[1]) {`
			`return &seqs[i];`
			`}`
			`}`
			`return NULL;`
			`}`

			`/* Checks that 's' is a valid, null-terminated UTF-8 string. If so, returns a`
			`* null pointer and sets '*lengthp' to the number of Unicode characters in`
			`* 's'. If not, returns an error message that the caller must free and sets`
			`* '*lengthp' to 0.`
			`*`
			`* 'lengthp' may be NULL if the length is not needed. */`
			`char *`
			`utf8_validate(const char s_, size_t lengthp)`
			`{`
			`size_t length = 0;`
			`const uint8_t *s;`

			`for (s = (const uint8_t ) s_; s != '\0'; ) {`
			`length++;`
			`if (s[0] < 0x80) {`
			`s++;`
			`} else {`
			`const struct utf8_sequence *seq;`
			`int i;`

			`seq = lookup_utf8_sequence(s[0]);`
			`if (!seq) {`
			`return invalid_utf8_sequence(s, 1, lengthp);`
			`}`

			`for (i = 1; seq->octets[i][0]; i++) {`
			`const uint8_t *o = seq->octets[i];`
			`if (s[i] < o[0] \|\| s[i] > o[1]) {`
			`return invalid_utf8_sequence(s, i + 1, lengthp);`
			`}`
			`}`
			`s += i;`
			`}`
			`}`
			`if (lengthp) {`
			`*lengthp = length;`
			`}`
			`return NULL;`
			`}`