[API CHANGE] rtl_convertTextToUnicode behavior upon erroneous input
<http://udk.openoffice.org/cpp/man/spec/textconversion.html> specifies that
FLAGS_UNDEFINED_ERROR, FLAGS_MBUNDEFINED_ERROR, and FLAGS_INVALID_ERROR: "Read
past the [erroneous] code in the input buffer [...]" But actual behavior of
rtl_convertTextToUnicode for the various rtl_TextEncoding values has been
inconsistent. Some erroneous input (mostly single-byte UNDEFINED and INVALID
ones) has not been consumed at all, some (multi-byte MBUNDEFINED and INVALID)
has been consumed partly, and some has been consumed fully as required.
However, at least since 8dd4265b9d "CWS-TOOLING: integrate CWS hb18",
Custom8BitToUnicode in sw/source/filter/ww8/ww8par.cxx
appears to rely on the broken behavior of not consuming erroneous input. (It
reads the chunk of valid input with e.g. some RTL_TEXTENCODING_MS_125x that
happens to exhibit the broken behavior of not consuming erroneous input, then
wants to try to re-read the erroneous input with RTL_TEXTENCODING_MS_1252. For
example, opening sw/qa/core/data/ww8/pass/forcepoint50-grfanchor-1.doc triggers
that code. For whatever reason, the am_faksas.dot attached to
<https://bz.apache.org/ooo/show_bug.cgi?id=9240#c1> "Do not show lithuanian
letter 'Š'" appears to not, or at least no longer, trigger that code.)
Therefore, it would be useful to have a mode in which rtl_convertTextToUnicode
does not consume erroneous input. (And I plan on doing changes in
sal/osl/unx/file* that would benefit from that behavior, too.) But changing
rtl_convertTextToUnicode to generally not consume erroneous input would not be
feasible: If calls do not set RTL_TEXTTOUNICODE_FLAGS_FLUSH, part of an
erroneous input can already have been consumed by a previous call, so the
current call cannot undo that.
But a change that looks like it can work is to change the behavior only if
RTL_TEXTTOUNICODE_FLAGS_FLUSH is set. In that case we can at least not consume
the part of an erroneous input that has not yet been consumed by a previous call
(which would necessarily have been done with RTL_TEXTTOUNICODE_FLAGS_FLUSH
unset). The expectation is that code that relies on the don't-consume behavior
will do only single calls with RTL_TEXTTOUNICODE_FLAGS_FLUSH set (so reliably
not consume the complete erroneous input), while other code (which might do
calls in a loop) will not care whether erroneous input has been consumed,
anyway. This can be considered a mild form of behavioral API CHANGE (but note
that the old implementation didn't exhibit the requested behavior anyway).
So all implementations of rtl_convertTextToUnicode for the various
rtl_TextEncoding values have been adapted to the new behavior. The only
exceptions are ImplDummyToUnicode (sal/textenc/textcvt.cxx), which is a special
case anyway used by RTL_TEXTENCODING_DONTKNOW, and two out of three places
(marked with a "TODO" each) in ImplUTF7ToUnicode (sal/textenc/tcvtutf7.cxx),
where it is hard to retrofit the expected behavior, and RTL_TEXTENCODING_UTF7 is
probably not relevant for the use cases relying on the don't-consume behavior,
anyway.
Whether a similar change should be done for rtl_convertUnicodeToText can be
examined later.
Change-Id: I1ac2c4cfd99e2a0eca219f9a3855ef110b254855
Reviewed-on: https://gerrit.libreoffice.org/78584
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
This commit is contained in:
@@ -92,6 +92,15 @@ SAL_DLLPUBLIC void SAL_CALL rtl_resetTextToUnicodeContext( rtl_TextToUnicodeConv
|
||||
#define RTL_TEXTTOUNICODE_INFO_INVALID ((sal_uInt32)0x0020)
|
||||
|
||||
/** see http://udk.openoffice.org/cpp/man/spec/textconversion.html
|
||||
|
||||
Deviating from the linked specification, the behavior of
|
||||
RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR, RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR, and
|
||||
RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR is as follows: When RTL_TEXTTOUNICODE_FLAGS_FLUSH is not
|
||||
set, the erroneous input bytes are consumed as required by the linked specification. But if
|
||||
RTL_TEXTTOUNICODE_FLAGS_FLUSH is set, any of those erroneous input bytes that would have been
|
||||
consumed by this invocation of rtl_convertTextToUnicode (i.e., which had not already been
|
||||
captured in hContext from a previous invocation with RTL_TEXTTOUNICODE_FLAGS_FLUSH unset) are
|
||||
not consumed.
|
||||
*/
|
||||
SAL_DLLPUBLIC sal_Size SAL_CALL rtl_convertTextToUnicode(
|
||||
rtl_TextToUnicodeConverter hConverter,
|
||||
|
@@ -165,7 +165,7 @@ void testSingleByteCharSet(SingleByteCharSet const & rSet) {
|
||||
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Size(0), nSize);
|
||||
CPPUNIT_ASSERT_EQUAL(nExpectedInfo, nInfo);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Size(0), nConverted);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Size(1), nConverted);
|
||||
|
||||
rtl_destroyTextToUnicodeContext(aConverter, aContext);
|
||||
rtl_destroyTextToUnicodeConverter(aConverter);
|
||||
|
@@ -82,6 +82,7 @@ sal_Size ImplConvertBig5HkscsToUnicode(void const * pData,
|
||||
sal_Size nConverted = 0;
|
||||
sal_Unicode * pDestBufPtr = pDestBuf;
|
||||
sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
|
||||
sal_Size startOfCurrentChar = 0;
|
||||
|
||||
if (pContext)
|
||||
nRow = static_cast< ImplBig5HkscsToUnicodeContext * >(pContext)->m_nRow;
|
||||
@@ -92,9 +93,10 @@ sal_Size ImplConvertBig5HkscsToUnicode(void const * pData,
|
||||
sal_uInt32 nChar = *reinterpret_cast<unsigned char const *>(pSrcBuf++);
|
||||
if (nRow == 0)
|
||||
if (nChar < 0x80)
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
else if (nChar >= 0x81 && nChar <= 0xFE)
|
||||
nRow = nChar;
|
||||
@@ -202,13 +204,15 @@ sal_Size ImplConvertBig5HkscsToUnicode(void const * pData,
|
||||
*pDestBufPtr++
|
||||
= static_cast<sal_Unicode>(pBig5Hkscs2001Data[
|
||||
nOffset + (nChar - nFirst)]);
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
}
|
||||
else
|
||||
goto no_output;
|
||||
else
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode);
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
nRow = 0;
|
||||
}
|
||||
@@ -226,10 +230,16 @@ sal_Size ImplConvertBig5HkscsToUnicode(void const * pData,
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
nRow = 0;
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++nConverted;
|
||||
} else {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
break;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
nRow = 0;
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
continue;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
|
||||
@@ -256,6 +266,10 @@ sal_Size ImplConvertBig5HkscsToUnicode(void const * pData,
|
||||
&nInfo))
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
nRow = 0;
|
||||
break;
|
||||
|
@@ -93,6 +93,7 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData,
|
||||
sal_Size nConverted = 0;
|
||||
sal_Unicode * pDestBufPtr = pDestBuf;
|
||||
sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
|
||||
sal_Size startOfCurrentChar = 0;
|
||||
|
||||
if (pContext)
|
||||
{
|
||||
@@ -109,9 +110,10 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData,
|
||||
{
|
||||
case IMPL_EUC_TW_TO_UNICODE_STATE_0:
|
||||
if (nChar < 0x80)
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
else if (nChar >= 0xA1 && nChar <= 0xFE)
|
||||
{
|
||||
@@ -210,13 +212,15 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData,
|
||||
*pDestBufPtr++
|
||||
= static_cast<sal_Unicode>(pCns116431992Data[
|
||||
nOffset + (nChar - nFirst)]);
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
}
|
||||
else
|
||||
goto no_output;
|
||||
else
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode);
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
}
|
||||
else
|
||||
@@ -234,10 +238,16 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData,
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
eState = IMPL_EUC_TW_TO_UNICODE_STATE_0;
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++nConverted;
|
||||
} else {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
break;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_EUC_TW_TO_UNICODE_STATE_0;
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
continue;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
|
||||
@@ -264,6 +274,10 @@ sal_Size ImplConvertEucTwToUnicode(void const * pData,
|
||||
&nInfo))
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_EUC_TW_TO_UNICODE_STATE_0;
|
||||
break;
|
||||
|
@@ -86,6 +86,7 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData,
|
||||
sal_Size nConverted = 0;
|
||||
sal_Unicode * pDestBufPtr = pDestBuf;
|
||||
sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
|
||||
sal_Size startOfCurrentChar = 0;
|
||||
|
||||
if (pContext)
|
||||
{
|
||||
@@ -101,9 +102,10 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData,
|
||||
{
|
||||
case IMPL_GB_18030_TO_UNICODE_STATE_0:
|
||||
if (nChar < 0x80)
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
else if (nChar == 0x80)
|
||||
goto bad_input;
|
||||
@@ -130,9 +132,10 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData,
|
||||
{
|
||||
nCode = nCode * 190 + (nChar <= 0x7E ? nChar - 0x40 :
|
||||
nChar - 0x80 + 63);
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = pGb18030Data[nCode];
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
|
||||
}
|
||||
@@ -170,6 +173,7 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData,
|
||||
= static_cast<sal_Unicode>(ImplGetHighSurrogate(nCode));
|
||||
*pDestBufPtr++
|
||||
= static_cast<sal_Unicode>(ImplGetLowSurrogate(nCode));
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
}
|
||||
else
|
||||
goto no_output;
|
||||
@@ -184,24 +188,26 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData,
|
||||
goto bad_input;
|
||||
else if (nCode < pRange->m_nFirstLinear)
|
||||
{
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++
|
||||
= pGb18030Data[
|
||||
pRange->m_nNonRangeDataIndex
|
||||
+ (nCode - nFirstNonRange)];
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
break;
|
||||
}
|
||||
else if (nCode < pRange->m_nPastLinear)
|
||||
{
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++
|
||||
= static_cast<sal_Unicode>(pRange->m_nFirstUnicode
|
||||
+ (nCode
|
||||
- pRange->
|
||||
m_nFirstLinear));
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
break;
|
||||
}
|
||||
@@ -226,10 +232,16 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData,
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++nConverted;
|
||||
} else {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
break;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
continue;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
|
||||
@@ -256,6 +268,10 @@ sal_Size ImplConvertGb18030ToUnicode(void const * pData,
|
||||
&nInfo))
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
|
||||
break;
|
||||
|
@@ -96,6 +96,7 @@ sal_Size IsciiDevanagariToUnicode::convert(
|
||||
sal_Size nConverted = 0;
|
||||
sal_Unicode* pDestBufPtr = pDestBuf;
|
||||
sal_Unicode* pDestBufEnd = pDestBuf + nDestChars;
|
||||
sal_Size startOfCurrentChar = 0;
|
||||
|
||||
while (nConverted < nSrcBytes)
|
||||
{
|
||||
@@ -180,6 +181,10 @@ sal_Size IsciiDevanagariToUnicode::convert(
|
||||
}
|
||||
}
|
||||
|
||||
++nConverted;
|
||||
if (bDouble)
|
||||
++nConverted;
|
||||
|
||||
if (bNormal)
|
||||
cChar = IsciiDevanagariMap[nIn];
|
||||
|
||||
@@ -190,20 +195,24 @@ sal_Size IsciiDevanagariToUnicode::convert(
|
||||
BadInputConversionAction eAction = handleBadInputTextToUnicodeConversion(
|
||||
bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
|
||||
&nInfo);
|
||||
if (eAction == BAD_INPUT_CONTINUE)
|
||||
if (eAction == BAD_INPUT_CONTINUE) {
|
||||
startOfCurrentChar = nConverted;
|
||||
continue;
|
||||
if (eAction == BAD_INPUT_STOP)
|
||||
}
|
||||
if (eAction == BAD_INPUT_STOP) {
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
break;
|
||||
}
|
||||
assert(eAction == BAD_INPUT_NO_OUTPUT);
|
||||
nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
|
||||
break;
|
||||
}
|
||||
++nConverted;
|
||||
if (bDouble)
|
||||
++nConverted;
|
||||
|
||||
*pDestBufPtr++ = cChar;
|
||||
m_cPrevChar = bNormal ? nIn : 0;
|
||||
startOfCurrentChar = nConverted;
|
||||
}
|
||||
|
||||
if (pInfo)
|
||||
|
@@ -124,6 +124,7 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData,
|
||||
sal_Size nConverted = 0;
|
||||
sal_Unicode * pDestBufPtr = pDestBuf;
|
||||
sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
|
||||
sal_Size startOfCurrentChar = 0;
|
||||
|
||||
if (pContext)
|
||||
{
|
||||
@@ -149,9 +150,10 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData,
|
||||
else if (nChar == 0x1B) // ESC
|
||||
eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_ESC;
|
||||
else if (nChar < 0x80)
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
else
|
||||
{
|
||||
@@ -203,6 +205,7 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData,
|
||||
{
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode);
|
||||
eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_SO;
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
}
|
||||
else
|
||||
goto no_output;
|
||||
@@ -332,13 +335,15 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData,
|
||||
*pDestBufPtr++
|
||||
= static_cast<sal_Unicode>(pCns116431992Data[
|
||||
nOffset + (nChar - nFirst)]);
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
}
|
||||
else
|
||||
goto no_output;
|
||||
else
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode);
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
}
|
||||
else
|
||||
@@ -358,11 +363,17 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData,
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_ASCII;
|
||||
b116431 = false;
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++nConverted;
|
||||
} else {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
break;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_ASCII;
|
||||
b116431 = false;
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
continue;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
|
||||
@@ -389,6 +400,10 @@ sal_Size ImplConvertIso2022CnToUnicode(void const * pData,
|
||||
&nInfo))
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_ISO_2022_CN_TO_UNICODE_STATE_ASCII;
|
||||
b116431 = false;
|
||||
|
@@ -94,6 +94,7 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData,
|
||||
sal_Size nConverted = 0;
|
||||
sal_Unicode * pDestBufPtr = pDestBuf;
|
||||
sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
|
||||
sal_Size startOfCurrentChar = 0;
|
||||
|
||||
if (pContext)
|
||||
{
|
||||
@@ -111,9 +112,10 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData,
|
||||
if (nChar == 0x1B) // ESC
|
||||
eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
|
||||
else if (nChar < 0x80)
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
else
|
||||
{
|
||||
@@ -139,6 +141,7 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData,
|
||||
break;
|
||||
}
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
}
|
||||
else
|
||||
goto no_output;
|
||||
@@ -178,6 +181,7 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData,
|
||||
{
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode);
|
||||
eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208;
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
}
|
||||
else
|
||||
goto no_output;
|
||||
@@ -248,10 +252,16 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData,
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++nConverted;
|
||||
} else {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
break;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
continue;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
|
||||
@@ -278,6 +288,10 @@ sal_Size ImplConvertIso2022JpToUnicode(void const * pData,
|
||||
&nInfo))
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
|
||||
break;
|
||||
|
@@ -100,6 +100,7 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData,
|
||||
sal_Size nConverted = 0;
|
||||
sal_Unicode * pDestBufPtr = pDestBuf;
|
||||
sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
|
||||
sal_Size startOfCurrentChar = 0;
|
||||
|
||||
if (pContext)
|
||||
{
|
||||
@@ -119,9 +120,10 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData,
|
||||
else if (nChar == 0x1B) // ESC
|
||||
eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC;
|
||||
else if (nChar < 0x80)
|
||||
if (pDestBufPtr != pDestBufEnd)
|
||||
if (pDestBufPtr != pDestBufEnd) {
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
|
||||
else
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
} else
|
||||
goto no_output;
|
||||
else
|
||||
{
|
||||
@@ -159,6 +161,7 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData,
|
||||
{
|
||||
*pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode);
|
||||
eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001;
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
}
|
||||
else
|
||||
goto no_output;
|
||||
@@ -211,10 +214,16 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData,
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++nConverted;
|
||||
} else {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
break;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
|
||||
startOfCurrentChar = nConverted + 1;
|
||||
continue;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
|
||||
@@ -241,6 +250,10 @@ sal_Size ImplConvertIso2022KrToUnicode(void const * pData,
|
||||
&nInfo))
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
|
||||
nConverted = startOfCurrentChar;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
|
||||
break;
|
||||
|
@@ -540,6 +540,9 @@ sal_Size sal::detail::textenc::convertCharToUnicode(
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_UNDEFINED;
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR )
|
||||
{
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++pSrcBuf;
|
||||
}
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
|
||||
break;
|
||||
}
|
||||
|
@@ -58,6 +58,9 @@ sal_Size rtl_textenc_convertSingleByteToBmpUnicode(
|
||||
&infoFlags))
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
if ((flags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++converted;
|
||||
}
|
||||
break;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
|
@@ -43,6 +43,7 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*,
|
||||
const ImplDBCSToUniLeadTab* pLeadTab = pConvertData->mpToUniLeadTab;
|
||||
sal_Unicode* pEndDestBuf;
|
||||
const char* pEndSrcBuf;
|
||||
char const * startOfCurrentChar = pSrcBuf;
|
||||
|
||||
*pInfo = 0;
|
||||
pEndDestBuf = pDestBuf+nDestChars;
|
||||
@@ -65,12 +66,18 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*,
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_UNDEFINED;
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR )
|
||||
{
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++pSrcBuf;
|
||||
} else {
|
||||
pSrcBuf = startOfCurrentChar;
|
||||
}
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
|
||||
break;
|
||||
}
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE )
|
||||
{
|
||||
pSrcBuf++;
|
||||
startOfCurrentChar = pSrcBuf;
|
||||
continue;
|
||||
}
|
||||
cConv = ImplGetUndefinedUnicodeChar(cLead, nFlags);
|
||||
@@ -158,12 +165,18 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*,
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID;
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR )
|
||||
{
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++pSrcBuf;
|
||||
} else {
|
||||
pSrcBuf = startOfCurrentChar;
|
||||
}
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
|
||||
break;
|
||||
}
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE )
|
||||
{
|
||||
pSrcBuf++;
|
||||
startOfCurrentChar = pSrcBuf;
|
||||
continue;
|
||||
}
|
||||
cConv = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER;
|
||||
@@ -176,12 +189,18 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*,
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED;
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR )
|
||||
{
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++pSrcBuf;
|
||||
} else {
|
||||
pSrcBuf = startOfCurrentChar;
|
||||
}
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
|
||||
break;
|
||||
}
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE )
|
||||
{
|
||||
pSrcBuf++;
|
||||
startOfCurrentChar = pSrcBuf;
|
||||
continue;
|
||||
}
|
||||
cConv = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER;
|
||||
@@ -197,6 +216,7 @@ sal_Size ImplDBCSToUnicode( const void* pData, SAL_UNUSED_PARAMETER void*,
|
||||
*pDestBuf = cConv;
|
||||
pDestBuf++;
|
||||
pSrcBuf++;
|
||||
startOfCurrentChar = pSrcBuf;
|
||||
}
|
||||
|
||||
*pSrcCvtBytes = nSrcBytes - (pEndSrcBuf-pSrcBuf);
|
||||
@@ -372,6 +392,7 @@ sal_Size ImplEUCJPToUnicode( const void* pData,
|
||||
const ImplEUCJPConvertData* pConvertData = static_cast<const ImplEUCJPConvertData*>(pData);
|
||||
sal_Unicode* pEndDestBuf;
|
||||
const char* pEndSrcBuf;
|
||||
char const * startOfCurrentChar = pSrcBuf;
|
||||
|
||||
*pInfo = 0;
|
||||
pEndDestBuf = pDestBuf+nDestChars;
|
||||
@@ -471,18 +492,29 @@ sal_Size ImplEUCJPToUnicode( const void* pData,
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID;
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR )
|
||||
{
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++pSrcBuf;
|
||||
} else {
|
||||
pSrcBuf = startOfCurrentChar;
|
||||
}
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
|
||||
break;
|
||||
}
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE )
|
||||
{
|
||||
pSrcBuf++;
|
||||
startOfCurrentChar = pSrcBuf;
|
||||
continue;
|
||||
}
|
||||
cConv = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++pSrcBuf;
|
||||
} else {
|
||||
pSrcBuf = startOfCurrentChar;
|
||||
}
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED;
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR )
|
||||
{
|
||||
@@ -492,6 +524,7 @@ sal_Size ImplEUCJPToUnicode( const void* pData,
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE )
|
||||
{
|
||||
pSrcBuf++;
|
||||
startOfCurrentChar = pSrcBuf;
|
||||
continue;
|
||||
}
|
||||
cConv = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER;
|
||||
@@ -508,6 +541,7 @@ sal_Size ImplEUCJPToUnicode( const void* pData,
|
||||
*pDestBuf = cConv;
|
||||
pDestBuf++;
|
||||
pSrcBuf++;
|
||||
startOfCurrentChar = pSrcBuf;
|
||||
}
|
||||
|
||||
*pSrcCvtBytes = nSrcBytes - (pEndSrcBuf-pSrcBuf);
|
||||
|
@@ -254,6 +254,13 @@ sal_Size ImplUTF7ToUnicode( SAL_UNUSED_PARAMETER const void*, void* pContext,
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID;
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR )
|
||||
{
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
if (!bEnd) {
|
||||
++pSrcBuf;
|
||||
}
|
||||
} else {
|
||||
//TODO: move pSrcBuf back to a reasonable starting place
|
||||
}
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
|
||||
break;
|
||||
}
|
||||
@@ -303,6 +310,13 @@ sal_Size ImplUTF7ToUnicode( SAL_UNUSED_PARAMETER const void*, void* pContext,
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID;
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR )
|
||||
{
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
if (!bEnd) {
|
||||
++pSrcBuf;
|
||||
}
|
||||
} else {
|
||||
//TODO: move pSrcBuf back to a reasonable starting place
|
||||
}
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
|
||||
break;
|
||||
}
|
||||
@@ -344,6 +358,9 @@ sal_Size ImplUTF7ToUnicode( SAL_UNUSED_PARAMETER const void*, void* pContext,
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID;
|
||||
if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR )
|
||||
{
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
++pSrcBuf;
|
||||
}
|
||||
*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR;
|
||||
break;
|
||||
}
|
||||
|
@@ -76,6 +76,7 @@ sal_Size ImplConvertUtf8ToUnicode(
|
||||
unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
|
||||
sal_Unicode * pDestBufPtr = pDestBuf;
|
||||
sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
|
||||
unsigned char const * startOfCurrentChar = pSrcBufPtr;
|
||||
|
||||
if (pContext != nullptr)
|
||||
{
|
||||
@@ -200,6 +201,7 @@ sal_Size ImplConvertUtf8ToUnicode(
|
||||
}
|
||||
nShift = -1;
|
||||
bCheckBom = false;
|
||||
startOfCurrentChar = pSrcBufPtr;
|
||||
continue;
|
||||
|
||||
bad_input:
|
||||
@@ -210,8 +212,12 @@ sal_Size ImplConvertUtf8ToUnicode(
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
nShift = -1;
|
||||
bCheckBom = false;
|
||||
if (!bConsume)
|
||||
--pSrcBufPtr;
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
|
||||
if (!bConsume)
|
||||
--pSrcBufPtr;
|
||||
} else {
|
||||
pSrcBufPtr = startOfCurrentChar;
|
||||
}
|
||||
break;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
@@ -219,6 +225,7 @@ sal_Size ImplConvertUtf8ToUnicode(
|
||||
bCheckBom = false;
|
||||
if (!bConsume)
|
||||
--pSrcBufPtr;
|
||||
startOfCurrentChar = pSrcBufPtr;
|
||||
continue;
|
||||
|
||||
case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
|
||||
@@ -245,6 +252,10 @@ sal_Size ImplConvertUtf8ToUnicode(
|
||||
&nInfo))
|
||||
{
|
||||
case sal::detail::textenc::BAD_INPUT_STOP:
|
||||
if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
|
||||
pSrcBufPtr = startOfCurrentChar;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case sal::detail::textenc::BAD_INPUT_CONTINUE:
|
||||
nShift = -1;
|
||||
bCheckBom = false;
|
||||
|
Reference in New Issue
Block a user