tdf#164493 Update script change algorithm to always make progress
Fixes ofz#385256118 Timeout on ImpEditEngine::InitScriptTypes. Previously, the script assignment algorithm could backtrack in certain cases where a script run ends with weak characters that should be included in the following run. Fuzz testing unearthed a case involving right-to-left override and a CJK combining mark, which caused the algorithm to make no progress and hang. The algorithm now always makes progress on each iteration. Change-Id: I4da138c51d391c152afcee2428c21dc762a2dafc Reviewed-on: https://gerrit.libreoffice.org/c/core/+/179888 Tested-by: Jenkins Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
This commit is contained in:
parent
f0efcb9170
commit
1afdda6bca
@ -39,6 +39,7 @@ public:
|
|||||||
void testRtlRunEmbeddedLtrStrong();
|
void testRtlRunEmbeddedLtrStrong();
|
||||||
void testRtlRunEmbeddedLtrWeakComplex();
|
void testRtlRunEmbeddedLtrWeakComplex();
|
||||||
void testRtlRunOverrideCJKAsian();
|
void testRtlRunOverrideCJKAsian();
|
||||||
|
void testTdf164493InfiniteLoop();
|
||||||
|
|
||||||
CPPUNIT_TEST_SUITE(ScriptChangeScannerTest);
|
CPPUNIT_TEST_SUITE(ScriptChangeScannerTest);
|
||||||
CPPUNIT_TEST(testEmpty);
|
CPPUNIT_TEST(testEmpty);
|
||||||
@ -57,6 +58,7 @@ public:
|
|||||||
CPPUNIT_TEST(testRtlRunEmbeddedLtrStrong);
|
CPPUNIT_TEST(testRtlRunEmbeddedLtrStrong);
|
||||||
CPPUNIT_TEST(testRtlRunEmbeddedLtrWeakComplex);
|
CPPUNIT_TEST(testRtlRunEmbeddedLtrWeakComplex);
|
||||||
CPPUNIT_TEST(testRtlRunOverrideCJKAsian);
|
CPPUNIT_TEST(testRtlRunOverrideCJKAsian);
|
||||||
|
CPPUNIT_TEST(testTdf164493InfiniteLoop);
|
||||||
CPPUNIT_TEST_SUITE_END();
|
CPPUNIT_TEST_SUITE_END();
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -592,6 +594,27 @@ void ScriptChangeScannerTest::testRtlRunOverrideCJKAsian()
|
|||||||
CPPUNIT_ASSERT(pScanner->AtEnd());
|
CPPUNIT_ASSERT(pScanner->AtEnd());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ScriptChangeScannerTest::testTdf164493InfiniteLoop()
|
||||||
|
{
|
||||||
|
// tdf#164493: Tests a case causing an infinite loop due to interaction
|
||||||
|
// between right-to-left override and a CJK combining mark.
|
||||||
|
|
||||||
|
// U+202E: RIGHT-TO-LEFT OVERRIDE
|
||||||
|
// U+302E: HANGUL SINGLE DOT TONE MARK
|
||||||
|
auto aText = u"\u202e\u302e"_ustr;
|
||||||
|
auto pDirScanner = MakeDirectionChangeScanner(aText, 0);
|
||||||
|
auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner);
|
||||||
|
|
||||||
|
CPPUNIT_ASSERT(!pScanner->AtEnd());
|
||||||
|
CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType);
|
||||||
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
|
||||||
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), pScanner->Peek().m_nEndIndex);
|
||||||
|
|
||||||
|
pScanner->Advance();
|
||||||
|
|
||||||
|
CPPUNIT_ASSERT(pScanner->AtEnd());
|
||||||
|
}
|
||||||
|
|
||||||
CPPUNIT_TEST_SUITE_REGISTRATION(ScriptChangeScannerTest);
|
CPPUNIT_TEST_SUITE_REGISTRATION(ScriptChangeScannerTest);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -132,84 +132,28 @@ private:
|
|||||||
ScriptChange m_stCurr;
|
ScriptChange m_stCurr;
|
||||||
DirectionChangeScanner* m_pDirScanner;
|
DirectionChangeScanner* m_pDirScanner;
|
||||||
const OUString& m_rText;
|
const OUString& m_rText;
|
||||||
sal_Int16 m_nPrevScript = css::i18n::ScriptType::WEAK;
|
|
||||||
sal_Int32 m_nIndex = 0;
|
sal_Int32 m_nIndex = 0;
|
||||||
|
sal_Int32 m_nNextStart = 0;
|
||||||
|
sal_Int16 m_nPrevScript = css::i18n::ScriptType::WEAK;
|
||||||
bool m_bAtEnd = false;
|
bool m_bAtEnd = false;
|
||||||
bool m_bApplyAsianToWeakQuotes = false;
|
bool m_bApplyAsianToWeakQuotes = false;
|
||||||
|
|
||||||
public:
|
void AdvanceOnce()
|
||||||
GreedyScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType,
|
|
||||||
DirectionChangeScanner* pDirScanner)
|
|
||||||
: m_pDirScanner(pDirScanner)
|
|
||||||
, m_rText(rText)
|
|
||||||
{
|
{
|
||||||
// tdf#66791: For compatibility with other programs, the Asian script is
|
m_stCurr
|
||||||
// applied to any weak-script quote characters if the enclosing paragraph
|
= ScriptChange{ /*start*/ m_nNextStart, /*end*/ m_nNextStart, /*type*/ m_nPrevScript };
|
||||||
// contains Chinese- or Japanese-script characters.
|
|
||||||
// In the original Writer algorithm, the application language is used for
|
|
||||||
// all leading weak characters (#94331#). This implementation deviates by
|
|
||||||
// instead using the first-seen non-weak script.
|
|
||||||
sal_Int32 nCjBase = 0;
|
|
||||||
while (nCjBase < m_rText.getLength())
|
|
||||||
{
|
|
||||||
auto nChar = m_rText.iterateCodePoints(&nCjBase);
|
|
||||||
auto nScript = GetScriptClass(nChar);
|
|
||||||
if (m_nPrevScript == css::i18n::ScriptType::WEAK)
|
|
||||||
{
|
|
||||||
m_nPrevScript = nScript;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nScript == css::i18n::ScriptType::COMPLEX)
|
if (m_nNextStart >= m_rText.getLength())
|
||||||
{
|
|
||||||
m_bApplyAsianToWeakQuotes = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto nUnicodeScript = u_getIntPropertyValue(nChar, UCHAR_SCRIPT);
|
|
||||||
switch (nUnicodeScript)
|
|
||||||
{
|
|
||||||
case USCRIPT_HAN:
|
|
||||||
case USCRIPT_HIRAGANA:
|
|
||||||
case USCRIPT_KATAKANA:
|
|
||||||
m_bApplyAsianToWeakQuotes = true;
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fall back to the application language for leading weak characters if a
|
|
||||||
// better candidate was not found.
|
|
||||||
if (m_nPrevScript == css::i18n::ScriptType::WEAK)
|
|
||||||
{
|
|
||||||
m_nPrevScript = nDefaultScriptType;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make a change record for leading weak characters.
|
|
||||||
Advance();
|
|
||||||
if (m_stCurr.m_nStartIndex == m_stCurr.m_nEndIndex)
|
|
||||||
{
|
|
||||||
// The text does not start with weak characters.
|
|
||||||
// Initialize with a non-empty record.
|
|
||||||
Advance();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool AtEnd() const override { return m_bAtEnd; }
|
|
||||||
|
|
||||||
void Advance() override
|
|
||||||
{
|
|
||||||
m_stCurr = ScriptChange{ /*start*/ 0, /*end*/ 0, /*type*/ m_nPrevScript };
|
|
||||||
|
|
||||||
if (m_nIndex >= m_rText.getLength())
|
|
||||||
{
|
{
|
||||||
m_bAtEnd = true;
|
m_bAtEnd = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto nRunStart = m_nIndex;
|
auto nRunStart = m_nNextStart;
|
||||||
|
m_nNextStart = m_nIndex;
|
||||||
|
|
||||||
auto nScript = m_nPrevScript;
|
auto nScript = m_nPrevScript;
|
||||||
|
|
||||||
while (m_nIndex < m_rText.getLength())
|
while (m_nIndex < m_rText.getLength())
|
||||||
{
|
{
|
||||||
auto nPrevIndex = m_nIndex;
|
auto nPrevIndex = m_nIndex;
|
||||||
@ -262,23 +206,25 @@ public:
|
|||||||
|
|
||||||
if (nScript != m_nPrevScript)
|
if (nScript != m_nPrevScript)
|
||||||
{
|
{
|
||||||
m_nIndex = nPrevIndex;
|
m_nNextStart = nPrevIndex;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m_nNextStart = m_nIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_nIndex > 0)
|
if (m_nNextStart > 0)
|
||||||
{
|
{
|
||||||
// special case for dotted circle since it can be used with complex
|
// special case for dotted circle since it can be used with complex
|
||||||
// before a mark, so we want it associated with the mark's script
|
// before a mark, so we want it associated with the mark's script
|
||||||
// tdf#112594: another special case for NNBSP followed by a Mongolian
|
// tdf#112594: another special case for NNBSP followed by a Mongolian
|
||||||
// character, since NNBSP has special uses in Mongolian (tdf#112594)
|
// character, since NNBSP has special uses in Mongolian (tdf#112594)
|
||||||
auto nPrevPos = m_nIndex;
|
auto nPrevPos = m_nNextStart;
|
||||||
auto nPrevChar = m_rText.iterateCodePoints(&nPrevPos, -1);
|
auto nPrevChar = m_rText.iterateCodePoints(&nPrevPos, -1);
|
||||||
if (m_nIndex < m_rText.getLength()
|
if (m_nNextStart < m_rText.getLength()
|
||||||
&& css::i18n::ScriptType::WEAK == GetScriptClass(nPrevChar))
|
&& css::i18n::ScriptType::WEAK == GetScriptClass(nPrevChar))
|
||||||
{
|
{
|
||||||
auto nChar = m_rText.iterateCodePoints(&m_nIndex, 0);
|
auto nChar = m_rText.iterateCodePoints(&m_nNextStart, 0);
|
||||||
auto nType = unicode::getUnicodeType(nChar);
|
auto nType = unicode::getUnicodeType(nChar);
|
||||||
if (nType == css::i18n::UnicodeType::NON_SPACING_MARK
|
if (nType == css::i18n::UnicodeType::NON_SPACING_MARK
|
||||||
|| nType == css::i18n::UnicodeType::ENCLOSING_MARK
|
|| nType == css::i18n::UnicodeType::ENCLOSING_MARK
|
||||||
@ -286,15 +232,77 @@ public:
|
|||||||
|| (nPrevChar == CHAR_NNBSP
|
|| (nPrevChar == CHAR_NNBSP
|
||||||
&& u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == USCRIPT_MONGOLIAN))
|
&& u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == USCRIPT_MONGOLIAN))
|
||||||
{
|
{
|
||||||
m_nIndex = nPrevPos;
|
m_nNextStart = nPrevPos;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
m_stCurr = ScriptChange{ nRunStart, m_nIndex, m_nPrevScript };
|
m_stCurr = ScriptChange{ nRunStart, m_nNextStart, m_nPrevScript };
|
||||||
m_nPrevScript = nScript;
|
m_nPrevScript = nScript;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
GreedyScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType,
|
||||||
|
DirectionChangeScanner* pDirScanner)
|
||||||
|
: m_pDirScanner(pDirScanner)
|
||||||
|
, m_rText(rText)
|
||||||
|
{
|
||||||
|
// tdf#66791: For compatibility with other programs, the Asian script is
|
||||||
|
// applied to any weak-script quote characters if the enclosing paragraph
|
||||||
|
// contains Chinese- or Japanese-script characters.
|
||||||
|
// In the original Writer algorithm, the application language is used for
|
||||||
|
// all leading weak characters (#94331#). This implementation deviates by
|
||||||
|
// instead using the first-seen non-weak script.
|
||||||
|
sal_Int32 nCjBase = 0;
|
||||||
|
while (nCjBase < m_rText.getLength())
|
||||||
|
{
|
||||||
|
auto nChar = m_rText.iterateCodePoints(&nCjBase);
|
||||||
|
auto nScript = GetScriptClass(nChar);
|
||||||
|
if (m_nPrevScript == css::i18n::ScriptType::WEAK)
|
||||||
|
{
|
||||||
|
m_nPrevScript = nScript;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nScript == css::i18n::ScriptType::COMPLEX)
|
||||||
|
{
|
||||||
|
m_bApplyAsianToWeakQuotes = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto nUnicodeScript = u_getIntPropertyValue(nChar, UCHAR_SCRIPT);
|
||||||
|
switch (nUnicodeScript)
|
||||||
|
{
|
||||||
|
case USCRIPT_HAN:
|
||||||
|
case USCRIPT_HIRAGANA:
|
||||||
|
case USCRIPT_KATAKANA:
|
||||||
|
m_bApplyAsianToWeakQuotes = true;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to the application language for leading weak characters if a
|
||||||
|
// better candidate was not found.
|
||||||
|
if (m_nPrevScript == css::i18n::ScriptType::WEAK)
|
||||||
|
{
|
||||||
|
m_nPrevScript = nDefaultScriptType;
|
||||||
|
}
|
||||||
|
|
||||||
|
Advance();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool AtEnd() const override { return m_bAtEnd; }
|
||||||
|
|
||||||
|
void Advance() override
|
||||||
|
{
|
||||||
|
do
|
||||||
|
{
|
||||||
|
AdvanceOnce();
|
||||||
|
} while (!AtEnd() && (m_stCurr.m_nStartIndex == m_stCurr.m_nEndIndex));
|
||||||
|
}
|
||||||
|
|
||||||
ScriptChange Peek() const override { return m_stCurr; }
|
ScriptChange Peek() const override { return m_stCurr; }
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user