tdf#164493 Update script change algorithm to always make progress

Fixes ofz#385256118 Timeout on ImpEditEngine::InitScriptTypes. Previously, the script assignment algorithm could backtrack in certain cases where a script run ends with weak characters that should be included in the following run. Fuzz testing unearthed a case involving right-to-left override and a CJK combining mark, which caused the algorithm to make no progress and hang. The algorithm now always makes progress on each iteration. Change-Id: I4da138c51d391c152afcee2428c21dc762a2dafc Reviewed-on: https://gerrit.libreoffice.org/c/core/+/179888 Tested-by: Jenkins Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
2025-01-07 05:52:27 -07:00 · 2025-01-07 05:52:27 -07:00 · 1afdda6bca
commit 1afdda6bca
parent f0efcb9170
2 changed files with 104 additions and 73 deletions
--- a/i18nutil/qa/cppunit/test_scriptchangescanner.cxx
+++ b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx
@ -39,6 +39,7 @@ public:
    void testRtlRunEmbeddedLtrStrong();
    void testRtlRunEmbeddedLtrWeakComplex();
    void testRtlRunOverrideCJKAsian();
    void testTdf164493InfiniteLoop();
    CPPUNIT_TEST_SUITE(ScriptChangeScannerTest);
    CPPUNIT_TEST(testEmpty);
@ -57,6 +58,7 @@ public:
    CPPUNIT_TEST(testRtlRunEmbeddedLtrStrong);
    CPPUNIT_TEST(testRtlRunEmbeddedLtrWeakComplex);
    CPPUNIT_TEST(testRtlRunOverrideCJKAsian);
    CPPUNIT_TEST(testTdf164493InfiniteLoop);
    CPPUNIT_TEST_SUITE_END();
 };
@ -592,6 +594,27 @@ void ScriptChangeScannerTest::testRtlRunOverrideCJKAsian()
    CPPUNIT_ASSERT(pScanner->AtEnd());
 }
 void ScriptChangeScannerTest::testTdf164493InfiniteLoop()
 {
    // tdf#164493: Tests a case causing an infinite loop due to interaction
    // between right-to-left override and a CJK combining mark.
    // U+202E: RIGHT-TO-LEFT OVERRIDE
    // U+302E: HANGUL SINGLE DOT TONE MARK
    auto aText = u"\u202e\u302e"_ustr;
    auto pDirScanner = MakeDirectionChangeScanner(aText, 0);
    auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner);
    CPPUNIT_ASSERT(!pScanner->AtEnd());
    CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(2), pScanner->Peek().m_nEndIndex);
    pScanner->Advance();
    CPPUNIT_ASSERT(pScanner->AtEnd());
 }
 CPPUNIT_TEST_SUITE_REGISTRATION(ScriptChangeScannerTest);
 }
--- a/i18nutil/source/utility/scriptchangescanner.cxx
+++ b/i18nutil/source/utility/scriptchangescanner.cxx
@ -132,84 +132,28 @@ private:
    ScriptChange m_stCurr;
    DirectionChangeScanner* m_pDirScanner;
    const OUString& m_rText;
    sal_Int16 m_nPrevScript = css::i18n::ScriptType::WEAK;
    sal_Int32 m_nIndex = 0;
    sal_Int32 m_nNextStart = 0;
    sal_Int16 m_nPrevScript = css::i18n::ScriptType::WEAK;
    bool m_bAtEnd = false;
    bool m_bApplyAsianToWeakQuotes = false;
-public:
+    void AdvanceOnce()
    GreedyScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType,
                              DirectionChangeScanner* pDirScanner)
        : m_pDirScanner(pDirScanner)
        , m_rText(rText)
    {
-        // tdf#66791: For compatibility with other programs, the Asian script is
+        m_stCurr
-        // applied to any weak-script quote characters if the enclosing paragraph
+            = ScriptChange{ /*start*/ m_nNextStart, /*end*/ m_nNextStart, /*type*/ m_nPrevScript };
        // contains Chinese- or Japanese-script characters.
        // In the original Writer algorithm, the application language is used for
        // all leading weak characters (#94331#). This implementation deviates by
        // instead using the first-seen non-weak script.
        sal_Int32 nCjBase = 0;
        while (nCjBase < m_rText.getLength())
        {
            auto nChar = m_rText.iterateCodePoints(&nCjBase);
            auto nScript = GetScriptClass(nChar);
            if (m_nPrevScript == css::i18n::ScriptType::WEAK)
            {
                m_nPrevScript = nScript;
            }
-            if (nScript == css::i18n::ScriptType::COMPLEX)
+        if (m_nNextStart >= m_rText.getLength())
            {
                m_bApplyAsianToWeakQuotes = false;
                break;
            }
            auto nUnicodeScript = u_getIntPropertyValue(nChar, UCHAR_SCRIPT);
            switch (nUnicodeScript)
            {
                case USCRIPT_HAN:
                case USCRIPT_HIRAGANA:
                case USCRIPT_KATAKANA:
                    m_bApplyAsianToWeakQuotes = true;
                    break;
                default:
                    break;
            }
        }
        // Fall back to the application language for leading weak characters if a
        // better candidate was not found.
        if (m_nPrevScript == css::i18n::ScriptType::WEAK)
        {
            m_nPrevScript = nDefaultScriptType;
        }
        // Make a change record for leading weak characters.
        Advance();
        if (m_stCurr.m_nStartIndex == m_stCurr.m_nEndIndex)
        {
            // The text does not start with weak characters.
            // Initialize with a non-empty record.
            Advance();
        }
    }
    bool AtEnd() const override { return m_bAtEnd; }
    void Advance() override
    {
        m_stCurr = ScriptChange{ /*start*/ 0, /*end*/ 0, /*type*/ m_nPrevScript };
        if (m_nIndex >= m_rText.getLength())
        {
            m_bAtEnd = true;
            return;
        }
-        auto nRunStart = m_nIndex;
+        auto nRunStart = m_nNextStart;
        m_nNextStart = m_nIndex;
        auto nScript = m_nPrevScript;
        while (m_nIndex < m_rText.getLength())
        {
            auto nPrevIndex = m_nIndex;
@ -262,23 +206,25 @@ public:
            if (nScript != m_nPrevScript)
            {
-                m_nIndex = nPrevIndex;
+                m_nNextStart = nPrevIndex;
                break;
            }
            m_nNextStart = m_nIndex;
        }
-        if (m_nIndex > 0)
+        if (m_nNextStart > 0)
        {
            // special case for dotted circle since it can be used with complex
            // before a mark, so we want it associated with the mark's script
            // tdf#112594: another special case for NNBSP followed by a Mongolian
            // character, since NNBSP has special uses in Mongolian (tdf#112594)
-            auto nPrevPos = m_nIndex;
+            auto nPrevPos = m_nNextStart;
            auto nPrevChar = m_rText.iterateCodePoints(&nPrevPos, -1);
-            if (m_nIndex < m_rText.getLength()
+            if (m_nNextStart < m_rText.getLength()
                && css::i18n::ScriptType::WEAK == GetScriptClass(nPrevChar))
            {
-                auto nChar = m_rText.iterateCodePoints(&m_nIndex, 0);
+                auto nChar = m_rText.iterateCodePoints(&m_nNextStart, 0);
                auto nType = unicode::getUnicodeType(nChar);
                if (nType == css::i18n::UnicodeType::NON_SPACING_MARK
                    || nType == css::i18n::UnicodeType::ENCLOSING_MARK
@ -286,15 +232,77 @@ public:
                    || (nPrevChar == CHAR_NNBSP
                        && u_getIntPropertyValue(nChar, UCHAR_SCRIPT) == USCRIPT_MONGOLIAN))
                {
-                    m_nIndex = nPrevPos;
+                    m_nNextStart = nPrevPos;
                }
            }
        }
-        m_stCurr = ScriptChange{ nRunStart, m_nIndex, m_nPrevScript };
+        m_stCurr = ScriptChange{ nRunStart, m_nNextStart, m_nPrevScript };
        m_nPrevScript = nScript;
    }
 public:
    GreedyScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType,
                              DirectionChangeScanner* pDirScanner)
        : m_pDirScanner(pDirScanner)
        , m_rText(rText)
    {
        // tdf#66791: For compatibility with other programs, the Asian script is
        // applied to any weak-script quote characters if the enclosing paragraph
        // contains Chinese- or Japanese-script characters.
        // In the original Writer algorithm, the application language is used for
        // all leading weak characters (#94331#). This implementation deviates by
        // instead using the first-seen non-weak script.
        sal_Int32 nCjBase = 0;
        while (nCjBase < m_rText.getLength())
        {
            auto nChar = m_rText.iterateCodePoints(&nCjBase);
            auto nScript = GetScriptClass(nChar);
            if (m_nPrevScript == css::i18n::ScriptType::WEAK)
            {
                m_nPrevScript = nScript;
            }
            if (nScript == css::i18n::ScriptType::COMPLEX)
            {
                m_bApplyAsianToWeakQuotes = false;
                break;
            }
            auto nUnicodeScript = u_getIntPropertyValue(nChar, UCHAR_SCRIPT);
            switch (nUnicodeScript)
            {
                case USCRIPT_HAN:
                case USCRIPT_HIRAGANA:
                case USCRIPT_KATAKANA:
                    m_bApplyAsianToWeakQuotes = true;
                    break;
                default:
                    break;
            }
        }
        // Fall back to the application language for leading weak characters if a
        // better candidate was not found.
        if (m_nPrevScript == css::i18n::ScriptType::WEAK)
        {
            m_nPrevScript = nDefaultScriptType;
        }
        Advance();
    }
    bool AtEnd() const override { return m_bAtEnd; }
    void Advance() override
    {
        do
        {
            AdvanceOnce();
        } while (!AtEnd() && (m_stCurr.m_nStartIndex == m_stCurr.m_nEndIndex));
    }
    ScriptChange Peek() const override { return m_stCurr; }
 };
 }