Files
libreoffice/include/rtl/textcvt.h
Stephan Bergmann ca6ddfcc93 [API CHANGE] rtl_convertTextToUnicode behavior upon erroneous input
<http://udk.openoffice.org/cpp/man/spec/textconversion.html> specifies that
FLAGS_UNDEFINED_ERROR, FLAGS_MBUNDEFINED_ERROR, and FLAGS_INVALID_ERROR: "Read
past the [erroneous] code in the input buffer [...]"  But actual behavior of
rtl_convertTextToUnicode for the various rtl_TextEncoding values has been
inconsistent.  Some erroneous input (mostly single-byte UNDEFINED and INVALID
ones) has not been consumed at all, some (multi-byte MBUNDEFINED and INVALID)
has been consumed partly, and some has been consumed fully as required.

However, at least since 8dd4265b9d "CWS-TOOLING:
integrate CWS hb18", Custom8BitToUnicode in sw/source/filter/ww8/ww8par.cxx
appears to rely on the broken behavior of not consuming erroneous input.  (It
reads the chunk of valid input with e.g. some RTL_TEXTENCODING_MS_125x that
happens to exhibit the broken behavior of not consuming erroneous input, then
wants to try to re-read the erroneous input with RTL_TEXTENCODING_MS_1252.  For
example, opening sw/qa/core/data/ww8/pass/forcepoint50-grfanchor-1.doc triggers
that code.  For whatever reason, the am_faksas.dot attached to
<https://bz.apache.org/ooo/show_bug.cgi?id=9240#c1> "Do not show lithuanian
letter 'Š'" appears to not, or at least no longer, trigger that code.)

Therefore, it would be useful to have a mode in which rtl_convertTextToUnicode
does not consume erroneous input.  (And I plan on doing changes in
sal/osl/unx/file* that would benefit from that behavior, too.)  But changing
rtl_convertTextToUnicode to generally not consume erroneous input would not be
feasible:  If calls do not set RTL_TEXTTOUNICODE_FLAGS_FLUSH, part of an
erroneous input can already have been consumed by a previous call, so the
current call cannot undo that.

But a change that looks like it can work is to change the behavior only if
RTL_TEXTTOUNICODE_FLAGS_FLUSH is set.  In that case we can at least not consume
the part of an erroneous input that has not yet been consumed by a previous call
(which would necessarily have been done with RTL_TEXTTOUNICODE_FLAGS_FLUSH
unset).  The expecation is that code that relies on the don't-consume behavior
will do only single calls with RTL_TEXTTOUNICODE_FLAGS_FLUSH set (so reliably
not consume the complete erroneous input), while other code (which might do
calls in a loop) will not care whether erroneous input has been consumed,
anyway.  This can be considered a mild form of behavioral API CHANGE (but note
that the old implementation didn't exhibit the requested behavior anyway).

So all implementations of rtl_convertTextToUnicode for the various
rtl_TextEncoding values have been adapted to the new behavior.  The only
exceptions are ImplDummyToUnicode (sal/textenc/textcvt.cxx), which is a special
case anyway used by RTL_TEXTENCODING_DONTKNOW, and two out of three places
(marked with a "TODO" each) in ImplUTF7ToUnicode (sal/textenc/tcvtutf7.cxx),
where it is hard to retrofit the expected behaivor, and RTL_TEXTENCODING_UTF7 is
probably not relevant for the use cases relying on the don't-consume--behavior,
anyway.

Whether a similar change should be done for rtl_convertUnicodeToText can be
examined later.

Change-Id: I1ac2c4cfd99e2a0eca219f9a3855ef110b254855
Reviewed-on: https://gerrit.libreoffice.org/78584
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
2019-09-04 13:02:11 +02:00

191 lines
9.9 KiB
C

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This file incorporates work covered by the following license notice:
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
#ifndef INCLUDED_RTL_TEXTCVT_H
#define INCLUDED_RTL_TEXTCVT_H
#include "sal/config.h"
#include "rtl/textenc.h"
#include "sal/saldllapi.h"
#include "sal/types.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Documentation about this file can be found at
<http://udk.openoffice.org/cpp/man/spec/textconversion.html>. */
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
typedef void* rtl_TextToUnicodeConverter;
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
typedef void* rtl_TextToUnicodeContext;
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC rtl_TextToUnicodeConverter SAL_CALL rtl_createTextToUnicodeConverter( rtl_TextEncoding eTextEncoding );
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC void SAL_CALL rtl_destroyTextToUnicodeConverter( rtl_TextToUnicodeConverter hConverter );
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC rtl_TextToUnicodeContext SAL_CALL rtl_createTextToUnicodeContext( rtl_TextToUnicodeConverter hConverter );
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC void SAL_CALL rtl_destroyTextToUnicodeContext( rtl_TextToUnicodeConverter hConverter, rtl_TextToUnicodeContext hContext );
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC void SAL_CALL rtl_resetTextToUnicodeContext( rtl_TextToUnicodeConverter hConverter, rtl_TextToUnicodeContext hContext );
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR ((sal_uInt32)0x0001)
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE ((sal_uInt32)0x0002)
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MAPTOPRIVATE ((sal_uInt32)0x0003)
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT ((sal_uInt32)0x0004)
#define RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR ((sal_uInt32)0x0010)
#define RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE ((sal_uInt32)0x0020)
#define RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT ((sal_uInt32)0x0030)
#define RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ((sal_uInt32)0x0100)
#define RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ((sal_uInt32)0x0200)
#define RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT ((sal_uInt32)0x0300)
#define RTL_TEXTTOUNICODE_FLAGS_FLUSH ((sal_uInt32)0x8000)
#define RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE 0x10000
/* Accept any global document signatures (for example, in UTF-8, a leading
EF BB BF encoding the Byte Order Mark U+FEFF) */
#define RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK ((sal_uInt32)0x000F)
#define RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK ((sal_uInt32)0x00F0)
#define RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK ((sal_uInt32)0x0F00)
#define RTL_TEXTTOUNICODE_INFO_ERROR ((sal_uInt32)0x0001)
// Misspelled constant, kept for backwards compatibility:
#define RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ((sal_uInt32)0x0002)
#define RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ((sal_uInt32)0x0002)
// Misspelled constant, kept for backwards compatibility:
#define RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ((sal_uInt32)0x0004)
#define RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ((sal_uInt32)0x0004)
#define RTL_TEXTTOUNICODE_INFO_UNDEFINED ((sal_uInt32)0x0008)
#define RTL_TEXTTOUNICODE_INFO_MBUNDEFINED ((sal_uInt32)0x0010)
#define RTL_TEXTTOUNICODE_INFO_INVALID ((sal_uInt32)0x0020)
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
Deviating from the linked specification, the behavior of
RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR, RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR, and
RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR is as follows: When RTL_TEXTTOUNICODE_FLAGS_FLUSH is not
set, the erroneous input bytes are consumed as required by the linked specification. But if
RTL_TEXTTOUNICODE_FLAGS_FLUSH is set, any of those erroneous input bytes that would have been
consumed by this invocation of rtl_convertTextToUnicode (i.e., which had not already been
captured in hContext from a previous invocation with RTL_TEXTTOUNICODE_FLAGS_FLUSH unset) are
not consumed.
*/
SAL_DLLPUBLIC sal_Size SAL_CALL rtl_convertTextToUnicode(
rtl_TextToUnicodeConverter hConverter,
rtl_TextToUnicodeContext hContext,
const sal_Char* pSrcBuf, sal_Size nSrcBytes,
sal_Unicode* pDestBuf, sal_Size nDestChars,
sal_uInt32 nFlags, sal_uInt32* pInfo,
sal_Size* pSrcCvtBytes );
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
typedef void* rtl_UnicodeToTextConverter;
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
typedef void* rtl_UnicodeToTextContext;
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC rtl_UnicodeToTextConverter SAL_CALL rtl_createUnicodeToTextConverter( rtl_TextEncoding eTextEncoding );
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC void SAL_CALL rtl_destroyUnicodeToTextConverter( rtl_UnicodeToTextConverter hConverter );
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC rtl_UnicodeToTextContext SAL_CALL rtl_createUnicodeToTextContext( rtl_UnicodeToTextConverter hConverter );
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC void SAL_CALL rtl_destroyUnicodeToTextContext( rtl_UnicodeToTextConverter hConverter, rtl_UnicodeToTextContext hContext );
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC void SAL_CALL rtl_resetUnicodeToTextContext( rtl_UnicodeToTextConverter hConverter, rtl_UnicodeToTextContext hContext );
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR ((sal_uInt32)0x0001)
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_IGNORE ((sal_uInt32)0x0002)
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_0 ((sal_uInt32)0x0003)
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_QUESTIONMARK ((sal_uInt32)0x0004)
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_UNDERLINE ((sal_uInt32)0x0005)
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT ((sal_uInt32)0x0006)
#define RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR ((sal_uInt32)0x0010)
#define RTL_UNICODETOTEXT_FLAGS_INVALID_IGNORE ((sal_uInt32)0x0020)
#define RTL_UNICODETOTEXT_FLAGS_INVALID_0 ((sal_uInt32)0x0030)
#define RTL_UNICODETOTEXT_FLAGS_INVALID_QUESTIONMARK ((sal_uInt32)0x0040)
#define RTL_UNICODETOTEXT_FLAGS_INVALID_UNDERLINE ((sal_uInt32)0x0050)
#define RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ((sal_uInt32)0x0060)
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE ((sal_uInt32)0x0100)
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR ((sal_uInt32)0x0200)
#define RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0 ((sal_uInt32)0x0400)
#define RTL_UNICODETOTEXT_FLAGS_NONSPACING_IGNORE ((sal_uInt32)0x0800)
#define RTL_UNICODETOTEXT_FLAGS_CONTROL_IGNORE ((sal_uInt32)0x1000)
#define RTL_UNICODETOTEXT_FLAGS_PRIVATE_IGNORE ((sal_uInt32)0x2000)
#define RTL_UNICODETOTEXT_FLAGS_NOCOMPOSITE ((sal_uInt32)0x4000) ///< has no effect @deprecated
#define RTL_UNICODETOTEXT_FLAGS_FLUSH ((sal_uInt32)0x8000)
#define RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE 0x10000
/* Write any global document signatures (for example, in UTF-8, a leading
EF BB BF encoding the Byte Order Mark U+FEFF) */
#define RTL_UNICODETOTEXT_FLAGS_UNDEFINED_MASK ((sal_uInt32)0x000F)
#define RTL_UNICODETOTEXT_FLAGS_INVALID_MASK ((sal_uInt32)0x00F0)
#define RTL_UNICODETOTEXT_INFO_ERROR ((sal_uInt32)0x0001)
#define RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ((sal_uInt32)0x0002)
#define RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ((sal_uInt32)0x0004)
#define RTL_UNICODETOTEXT_INFO_UNDEFINED ((sal_uInt32)0x0008)
#define RTL_UNICODETOTEXT_INFO_INVALID ((sal_uInt32)0x0010)
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
*/
SAL_DLLPUBLIC sal_Size SAL_CALL rtl_convertUnicodeToText(
rtl_UnicodeToTextConverter hConverter,
rtl_UnicodeToTextContext hContext,
const sal_Unicode* pSrcBuf, sal_Size nSrcChars,
sal_Char* pDestBuf, sal_Size nDestBytes,
sal_uInt32 nFlags, sal_uInt32* pInfo,
sal_Size* pSrcCvtChars );
#ifdef __cplusplus
}
#endif
#endif // INCLUDED_RTL_TEXTCVT_H
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */