<http://udk.openoffice.org/cpp/man/spec/textconversion.html> specifies for
FLAGS_UNDEFINED_ERROR, FLAGS_MBUNDEFINED_ERROR, and FLAGS_INVALID_ERROR: "Read
past the [erroneous] code in the input buffer [...]" But actual behavior of
rtl_convertTextToUnicode for the various rtl_TextEncoding values has been
inconsistent. Some erroneous input (mostly single-byte UNDEFINED and INVALID
ones) has not been consumed at all, some (multi-byte MBUNDEFINED and INVALID)
has been consumed partly, and some has been consumed fully as required.
However, at least since 8dd4265b9d "CWS-TOOLING: integrate CWS hb18",
Custom8BitToUnicode in sw/source/filter/ww8/ww8par.cxx
appears to rely on the broken behavior of not consuming erroneous input. (It
reads the chunk of valid input with e.g. some RTL_TEXTENCODING_MS_125x that
happens to exhibit the broken behavior of not consuming erroneous input, then
wants to try to re-read the erroneous input with RTL_TEXTENCODING_MS_1252. For
example, opening sw/qa/core/data/ww8/pass/forcepoint50-grfanchor-1.doc triggers
that code. For whatever reason, the am_faksas.dot attached to
<https://bz.apache.org/ooo/show_bug.cgi?id=9240#c1> "Do not show lithuanian
letter 'Š'" appears to not, or at least no longer, trigger that code.)
Therefore, it would be useful to have a mode in which rtl_convertTextToUnicode
does not consume erroneous input. (And I plan on doing changes in
sal/osl/unx/file* that would benefit from that behavior, too.) But changing
rtl_convertTextToUnicode to generally not consume erroneous input would not be
feasible: If calls do not set RTL_TEXTTOUNICODE_FLAGS_FLUSH, part of an
erroneous input can already have been consumed by a previous call, so the
current call cannot undo that.
But a change that looks like it can work is to change the behavior only if
RTL_TEXTTOUNICODE_FLAGS_FLUSH is set. In that case we can at least not consume
the part of an erroneous input that has not yet been consumed by a previous call
(which would necessarily have been done with RTL_TEXTTOUNICODE_FLAGS_FLUSH
unset). The expectation is that code that relies on the don't-consume behavior
will do only single calls with RTL_TEXTTOUNICODE_FLAGS_FLUSH set (so reliably
not consume the complete erroneous input), while other code (which might do
calls in a loop) will not care whether erroneous input has been consumed,
anyway. This can be considered a mild form of behavioral API CHANGE (but note
that the old implementation didn't exhibit the requested behavior anyway).
So all implementations of rtl_convertTextToUnicode for the various
rtl_TextEncoding values have been adapted to the new behavior. The only
exceptions are ImplDummyToUnicode (sal/textenc/textcvt.cxx), which is a special
case anyway used by RTL_TEXTENCODING_DONTKNOW, and two out of three places
(marked with a "TODO" each) in ImplUTF7ToUnicode (sal/textenc/tcvtutf7.cxx),
where it is hard to retrofit the expected behavior, and RTL_TEXTENCODING_UTF7 is
probably not relevant for the use cases relying on the don't-consume behavior,
anyway.
Whether a similar change should be done for rtl_convertUnicodeToText can be
examined later.
Change-Id: I1ac2c4cfd99e2a0eca219f9a3855ef110b254855
Reviewed-on: https://gerrit.libreoffice.org/78584
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
191 lines
9.9 KiB
C
191 lines
9.9 KiB
C
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* This file incorporates work covered by the following license notice:
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
|
*/
|
|
|
|
#ifndef INCLUDED_RTL_TEXTCVT_H
|
|
#define INCLUDED_RTL_TEXTCVT_H
|
|
|
|
#include "sal/config.h"
|
|
|
|
#include "rtl/textenc.h"
|
|
#include "sal/saldllapi.h"
|
|
#include "sal/types.h"
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/* Documentation about this file can be found at
|
|
<http://udk.openoffice.org/cpp/man/spec/textconversion.html>. */
|
|
|
|
/** Opaque converter handle used by the text-to-Unicode half of this API.

    The type is a plain void*, so this header exposes nothing about what it points to. Values
    of this type are returned by rtl_createTextToUnicodeConverter and are taken as the
    hConverter parameter of the other rtl_*TextToUnicode* functions declared in this file.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
typedef void* rtl_TextToUnicodeConverter;
|
|
|
|
/** Opaque context handle used by the text-to-Unicode half of this API.

    The type is a plain void*. Values of this type are returned by
    rtl_createTextToUnicodeContext and are taken as the hContext parameter of
    rtl_destroyTextToUnicodeContext, rtl_resetTextToUnicodeContext and rtl_convertTextToUnicode.
    The documentation of rtl_convertTextToUnicode speaks of input bytes being "captured in
    hContext" by an invocation made without RTL_TEXTTOUNICODE_FLAGS_FLUSH, i.e. the context
    carries state from one conversion call to the next.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
typedef void* rtl_TextToUnicodeContext;
|
|
|
|
/** Takes a text encoding and returns a text-to-Unicode converter handle.

    NOTE(review): this header does not state what is returned when eTextEncoding is not
    supported (presumably a null handle), nor who releases the handle (presumably the caller,
    via rtl_destroyTextToUnicodeConverter) -- confirm against the specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC rtl_TextToUnicodeConverter SAL_CALL rtl_createTextToUnicodeConverter( rtl_TextEncoding eTextEncoding );
|
|
|
|
/** Takes a text-to-Unicode converter handle and returns nothing.

    By its name this is the counterpart of rtl_createTextToUnicodeConverter.
    NOTE(review): whether a null handle is accepted, and whether contexts created from
    hConverter must be destroyed first, is not stated in this header -- confirm against the
    specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC void SAL_CALL rtl_destroyTextToUnicodeConverter( rtl_TextToUnicodeConverter hConverter );
|
|
|
|
/** Takes a text-to-Unicode converter handle and returns a context handle for it.

    The returned handle has the type that rtl_convertTextToUnicode,
    rtl_resetTextToUnicodeContext and rtl_destroyTextToUnicodeContext take as hContext.
    NOTE(review): the result on failure, and whether every converter needs a context, is not
    stated in this header -- confirm against the specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC rtl_TextToUnicodeContext SAL_CALL rtl_createTextToUnicodeContext( rtl_TextToUnicodeConverter hConverter );
|
|
|
|
/** Takes a converter handle together with one of its context handles and returns nothing.

    By its name this is the counterpart of rtl_createTextToUnicodeContext.
    NOTE(review): presumably hContext must have been created from this same hConverter, and
    must not be used afterwards -- confirm against the specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC void SAL_CALL rtl_destroyTextToUnicodeContext( rtl_TextToUnicodeConverter hConverter, rtl_TextToUnicodeContext hContext );
|
|
|
|
/** Takes a converter handle together with one of its context handles and returns nothing.

    NOTE(review): by its name this presumably puts hContext back into its initial state,
    discarding whatever earlier rtl_convertTextToUnicode calls left in it, so that the context
    can be reused for unrelated input -- confirm against the specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC void SAL_CALL rtl_resetTextToUnicodeContext( rtl_TextToUnicodeConverter hConverter, rtl_TextToUnicodeContext hContext );
|
|
|
|
/* Flag values for the nFlags parameter of rtl_convertTextToUnicode.

   The three _MASK macros at the end of this group split the low bits into fields:
       0x000F  the UNDEFINED_ values
       0x00F0  the MBUNDEFINED_ values
       0x0F00  the INVALID_ values
   Within a field the values are numbered rather than bit-disjoint: for example
   UNDEFINED_MAPTOPRIVATE (0x0003) has the same bits as UNDEFINED_ERROR | UNDEFINED_IGNORE.
   Presumably exactly one value per field is chosen, and code tests it as
   (nFlags & MASK) == VALUE instead of testing single bits -- TODO confirm against
   http://udk.openoffice.org/cpp/man/spec/textconversion.html

   FLUSH (0x8000) and GLOBAL_SIGNATURE (0x10000) lie outside all three masks. How FLUSH
   changes the handling of the three _ERROR values is described in the documentation of
   rtl_convertTextToUnicode. */
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR ((sal_uInt32)0x0001)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE ((sal_uInt32)0x0002)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MAPTOPRIVATE ((sal_uInt32)0x0003)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT ((sal_uInt32)0x0004)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR ((sal_uInt32)0x0010)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE ((sal_uInt32)0x0020)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT ((sal_uInt32)0x0030)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ((sal_uInt32)0x0100)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ((sal_uInt32)0x0200)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT ((sal_uInt32)0x0300)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_FLUSH ((sal_uInt32)0x8000)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE 0x10000
|
|
/* Accept any global document signatures (for example, in UTF-8, a leading
   EF BB BF encoding the Byte Order Mark U+FEFF).
   NOTE(review): unlike the other flags in this group, this macro expands to a plain integer
   literal without a (sal_uInt32) cast. */
|
|
|
|
/* Field masks: each one covers the four bits that hold one family of the values above. */
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK ((sal_uInt32)0x000F)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK ((sal_uInt32)0x00F0)
|
|
#define RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK ((sal_uInt32)0x0F00)
|
|
|
|
/* Status values of the text-to-Unicode conversion.

   Every distinct value below is a single bit (0x0001 through 0x0020), so several of them fit
   into one sal_uInt32 at the same time. Presumably they are what rtl_convertTextToUnicode
   stores through its pInfo parameter -- TODO confirm against
   http://udk.openoffice.org/cpp/man/spec/textconversion.html

   Each ...TOSMALL name has a correctly spelled ...TOOSMALL twin with the identical value. */
#define RTL_TEXTTOUNICODE_INFO_ERROR ((sal_uInt32)0x0001)
|
|
// Misspelled constant, kept for backwards compatibility:
|
|
#define RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ((sal_uInt32)0x0002)
|
|
#define RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ((sal_uInt32)0x0002)
|
|
// Misspelled constant, kept for backwards compatibility:
|
|
#define RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ((sal_uInt32)0x0004)
|
|
#define RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ((sal_uInt32)0x0004)
|
|
#define RTL_TEXTTOUNICODE_INFO_UNDEFINED ((sal_uInt32)0x0008)
|
|
#define RTL_TEXTTOUNICODE_INFO_MBUNDEFINED ((sal_uInt32)0x0010)
|
|
#define RTL_TEXTTOUNICODE_INFO_INVALID ((sal_uInt32)0x0020)
|
|
|
|
/** The text-to-Unicode conversion entry point.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html

    Deviating from the linked specification, the behavior of
    RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR, RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR, and
    RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR is as follows: When RTL_TEXTTOUNICODE_FLAGS_FLUSH is not
    set, the erroneous input bytes are consumed as required by the linked specification. But if
    RTL_TEXTTOUNICODE_FLAGS_FLUSH is set, any of those erroneous input bytes that would have been
    consumed by this invocation of rtl_convertTextToUnicode (i.e., which had not already been
    captured in hContext from a previous invocation with RTL_TEXTTOUNICODE_FLAGS_FLUSH unset) are
    not consumed.

    Parameter notes, inferred from the declaration only -- TODO confirm each against the linked
    specification:
    - hConverter, hContext: handles of the types returned by rtl_createTextToUnicodeConverter
      and rtl_createTextToUnicodeContext.
    - pSrcBuf, nSrcBytes: the input buffer (read-only) and, presumably, its length in bytes.
    - pDestBuf, nDestChars: the output buffer and, presumably, its capacity in sal_Unicode
      units.
    - nFlags: presumably a combination of the RTL_TEXTTOUNICODE_FLAGS_ values.
    - pInfo: presumably receives RTL_TEXTTOUNICODE_INFO_ bits.
    - pSrcCvtBytes: presumably receives the number of input bytes consumed, which is the
      quantity the deviation described above changes.
    - return value: presumably the number of sal_Unicode units written to pDestBuf.
*/
|
|
SAL_DLLPUBLIC sal_Size SAL_CALL rtl_convertTextToUnicode(
|
|
rtl_TextToUnicodeConverter hConverter,
|
|
rtl_TextToUnicodeContext hContext,
|
|
const sal_Char* pSrcBuf, sal_Size nSrcBytes,
|
|
sal_Unicode* pDestBuf, sal_Size nDestChars,
|
|
sal_uInt32 nFlags, sal_uInt32* pInfo,
|
|
sal_Size* pSrcCvtBytes );
|
|
|
|
/** Opaque converter handle used by the Unicode-to-text half of this API.

    The type is a plain void*, so this header exposes nothing about what it points to. Values
    of this type are returned by rtl_createUnicodeToTextConverter and are taken as the
    hConverter parameter of the other rtl_*UnicodeToText* functions declared in this file.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
typedef void* rtl_UnicodeToTextConverter;
|
|
|
|
/** Opaque context handle used by the Unicode-to-text half of this API.

    The type is a plain void*. Values of this type are returned by
    rtl_createUnicodeToTextContext and are taken as the hContext parameter of
    rtl_destroyUnicodeToTextContext, rtl_resetUnicodeToTextContext and rtl_convertUnicodeToText.
    NOTE(review): presumably it carries conversion state from one rtl_convertUnicodeToText call
    to the next -- confirm against the specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
typedef void* rtl_UnicodeToTextContext;
|
|
|
|
/** Takes a text encoding and returns a Unicode-to-text converter handle.

    NOTE(review): this header does not state what is returned when eTextEncoding is not
    supported (presumably a null handle), nor who releases the handle (presumably the caller,
    via rtl_destroyUnicodeToTextConverter) -- confirm against the specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC rtl_UnicodeToTextConverter SAL_CALL rtl_createUnicodeToTextConverter( rtl_TextEncoding eTextEncoding );
|
|
|
|
/** Takes a Unicode-to-text converter handle and returns nothing.

    By its name this is the counterpart of rtl_createUnicodeToTextConverter.
    NOTE(review): whether a null handle is accepted, and whether contexts created from
    hConverter must be destroyed first, is not stated in this header -- confirm against the
    specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC void SAL_CALL rtl_destroyUnicodeToTextConverter( rtl_UnicodeToTextConverter hConverter );
|
|
|
|
/** Takes a Unicode-to-text converter handle and returns a context handle for it.

    The returned handle has the type that rtl_convertUnicodeToText,
    rtl_resetUnicodeToTextContext and rtl_destroyUnicodeToTextContext take as hContext.
    NOTE(review): the result on failure, and whether every converter needs a context, is not
    stated in this header -- confirm against the specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC rtl_UnicodeToTextContext SAL_CALL rtl_createUnicodeToTextContext( rtl_UnicodeToTextConverter hConverter );
|
|
|
|
/** Takes a converter handle together with one of its context handles and returns nothing.

    By its name this is the counterpart of rtl_createUnicodeToTextContext.
    NOTE(review): presumably hContext must have been created from this same hConverter, and
    must not be used afterwards -- confirm against the specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC void SAL_CALL rtl_destroyUnicodeToTextContext( rtl_UnicodeToTextConverter hConverter, rtl_UnicodeToTextContext hContext );
|
|
|
|
/** Takes a converter handle together with one of its context handles and returns nothing.

    NOTE(review): by its name this presumably puts hContext back into its initial state,
    discarding whatever earlier rtl_convertUnicodeToText calls left in it, so that the context
    can be reused for unrelated input -- confirm against the specification linked below.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
|
|
SAL_DLLPUBLIC void SAL_CALL rtl_resetUnicodeToTextContext( rtl_UnicodeToTextConverter hConverter, rtl_UnicodeToTextContext hContext );
|
|
|
|
/* Flag values for the nFlags parameter of rtl_convertUnicodeToText.

   The two _MASK macros at the end of this group define two four-bit fields:
       0x000F  UNDEFINED_ERROR through UNDEFINED_DEFAULT (values 0x0001 to 0x0006)
       0x00F0  INVALID_ERROR through INVALID_DEFAULT     (values 0x0010 to 0x0060)
   Within a field the values are numbered rather than bit-disjoint (for example UNDEFINED_0,
   0x0003, has the same bits as UNDEFINED_ERROR | UNDEFINED_IGNORE). Presumably exactly one
   value per field is chosen, and code tests it as (nFlags & MASK) == VALUE -- TODO confirm
   against http://udk.openoffice.org/cpp/man/spec/textconversion.html

   Everything from 0x0100 upwards is a separate single bit outside both masks. Note that this
   includes UNDEFINED_REPLACE (0x0100) and UNDEFINED_REPLACESTR (0x0200): despite the UNDEFINED_
   prefix they are not covered by RTL_UNICODETOTEXT_FLAGS_UNDEFINED_MASK. */
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR ((sal_uInt32)0x0001)
|
|
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_IGNORE ((sal_uInt32)0x0002)
|
|
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_0 ((sal_uInt32)0x0003)
|
|
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_QUESTIONMARK ((sal_uInt32)0x0004)
|
|
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_UNDERLINE ((sal_uInt32)0x0005)
|
|
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT ((sal_uInt32)0x0006)
|
|
#define RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR ((sal_uInt32)0x0010)
|
|
#define RTL_UNICODETOTEXT_FLAGS_INVALID_IGNORE ((sal_uInt32)0x0020)
|
|
#define RTL_UNICODETOTEXT_FLAGS_INVALID_0 ((sal_uInt32)0x0030)
|
|
#define RTL_UNICODETOTEXT_FLAGS_INVALID_QUESTIONMARK ((sal_uInt32)0x0040)
|
|
#define RTL_UNICODETOTEXT_FLAGS_INVALID_UNDERLINE ((sal_uInt32)0x0050)
|
|
#define RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ((sal_uInt32)0x0060)
|
|
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE ((sal_uInt32)0x0100)
|
|
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR ((sal_uInt32)0x0200)
|
|
#define RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0 ((sal_uInt32)0x0400)
|
|
#define RTL_UNICODETOTEXT_FLAGS_NONSPACING_IGNORE ((sal_uInt32)0x0800)
|
|
#define RTL_UNICODETOTEXT_FLAGS_CONTROL_IGNORE ((sal_uInt32)0x1000)
|
|
#define RTL_UNICODETOTEXT_FLAGS_PRIVATE_IGNORE ((sal_uInt32)0x2000)
|
|
#define RTL_UNICODETOTEXT_FLAGS_NOCOMPOSITE ((sal_uInt32)0x4000) ///< has no effect @deprecated
|
|
#define RTL_UNICODETOTEXT_FLAGS_FLUSH ((sal_uInt32)0x8000)
|
|
#define RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE 0x10000
|
|
/* Write any global document signatures (for example, in UTF-8, a leading
   EF BB BF encoding the Byte Order Mark U+FEFF).
   NOTE(review): unlike the other flags in this group, this macro expands to a plain integer
   literal without a (sal_uInt32) cast. */
|
|
|
|
/* Field masks: each one covers the four bits that hold one family of the values above. */
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_MASK ((sal_uInt32)0x000F)
|
|
#define RTL_UNICODETOTEXT_FLAGS_INVALID_MASK ((sal_uInt32)0x00F0)
|
|
|
|
/* Status values of the Unicode-to-text conversion.

   Every value below is a single bit (0x0001 through 0x0010), so several of them fit into one
   sal_uInt32 at the same time. Presumably they are what rtl_convertUnicodeToText stores through
   its pInfo parameter -- TODO confirm against
   http://udk.openoffice.org/cpp/man/spec/textconversion.html

   NOTE(review): SRCBUFFERTOSMALL and DESTBUFFERTOSMALL carry the same "TOSMALL" spelling that
   the RTL_TEXTTOUNICODE_INFO_ group marks as misspelled, but this group defines no
   ...TOOSMALL aliases. */
#define RTL_UNICODETOTEXT_INFO_ERROR ((sal_uInt32)0x0001)
|
|
#define RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ((sal_uInt32)0x0002)
|
|
#define RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ((sal_uInt32)0x0004)
|
|
#define RTL_UNICODETOTEXT_INFO_UNDEFINED ((sal_uInt32)0x0008)
|
|
#define RTL_UNICODETOTEXT_INFO_INVALID ((sal_uInt32)0x0010)
|
|
|
|
/** The Unicode-to-text conversion entry point.

    see http://udk.openoffice.org/cpp/man/spec/textconversion.html

    This header documents no deviation from the linked specification for this function; the
    FLUSH-dependent handling of erroneous input that is documented for rtl_convertTextToUnicode
    is not stated to apply here.

    Parameter notes, inferred from the declaration only -- TODO confirm each against the linked
    specification:
    - hConverter, hContext: handles of the types returned by rtl_createUnicodeToTextConverter
      and rtl_createUnicodeToTextContext.
    - pSrcBuf, nSrcChars: the input buffer (read-only) and, presumably, its length in
      sal_Unicode units.
    - pDestBuf, nDestBytes: the output buffer and, presumably, its capacity in bytes.
    - nFlags: presumably a combination of the RTL_UNICODETOTEXT_FLAGS_ values.
    - pInfo: presumably receives RTL_UNICODETOTEXT_INFO_ bits.
    - pSrcCvtChars: presumably receives the number of input sal_Unicode units consumed.
    - return value: presumably the number of bytes written to pDestBuf.
*/
|
|
SAL_DLLPUBLIC sal_Size SAL_CALL rtl_convertUnicodeToText(
|
|
rtl_UnicodeToTextConverter hConverter,
|
|
rtl_UnicodeToTextContext hContext,
|
|
const sal_Unicode* pSrcBuf, sal_Size nSrcChars,
|
|
sal_Char* pDestBuf, sal_Size nDestBytes,
|
|
sal_uInt32 nFlags, sal_uInt32* pInfo,
|
|
sal_Size* pSrcCvtChars );
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif // INCLUDED_RTL_TEXTCVT_H
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|