Follow UAX-29 and present user-perceived character counts

Count neither code units nor code points; instead, try to follow UAX #29 (http://unicode.org/reports/tr29/) and present the grapheme count. Add a few representative tests to help avoid gotchas.
2011-11-03 11:00:38 +00:00
parent 56d5a48099
commit 12db5315fc
5 changed files with 76 additions and 14 deletions
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -151,6 +151,24 @@ void TestBreakIterator::testGraphemeIteration()
            i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
        CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
    }
    {
        const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
        ::rtl::OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
        sal_Int32 nGraphemeCount = 0;
        sal_Int32 nCurPos = 0;
        while (nCurPos < aText.getLength())
        {
            sal_Int32 nCount2 = 1;
            nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
                i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
            ++nGraphemeCount;
        }
        CPPUNIT_ASSERT_MESSAGE("Should be considered 1 grapheme", nGraphemeCount == 1);
    }
 }
 //A test to ensure that certain ranges and codepoints that are categorized as
--- a/sw/inc/breakit.hxx
+++ b/sw/inc/breakit.hxx
@@ -111,6 +111,8 @@ public:
    sal_uInt16 GetRealScriptOfText( const String& rTxt, xub_StrLen nPos ) const;
    sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const;
    sal_Int32 getGraphemeCount(const rtl::OUString& rStr) const;
 };
 #define SW_BREAKITER()  SwBreakIt::Get()
--- a/sw/qa/core/swdoc-test.cxx
+++ b/sw/qa/core/swdoc-test.cxx
@@ -41,19 +41,20 @@
 #include <sfx2/docfile.hxx>
 #include <sfx2/sfxmodelfactory.hxx>
-#include "init.hxx"
+#include "breakit.hxx"
 #include "swtypes.hxx"
 #include "docstat.hxx"
 #include "doc.hxx"
 #include "ndtxt.hxx"
 #include "docsh.hxx"
-#include "shellres.hxx"
+#include "docstat.hxx"
 #include "docufld.hxx"
 #include "fmtanchr.hxx"
-#include "swscanner.hxx"
+#include "init.hxx"
-#include "swcrsr.hxx"
+#include "ndtxt.hxx"
 #include "swmodule.hxx"
 #include "shellio.hxx"
 #include "shellres.hxx"
 #include "swcrsr.hxx"
 #include "swscanner.hxx"
 #include "swmodule.hxx"
 #include "swtypes.hxx"
 SO2_DECL_REF(SwDocShell)
 SO2_IMPL_REF(SwDocShell)
@@ -73,14 +74,15 @@ public:
    void testFileNameFields();
    void testDocStat();
    void testSwScanner();
    void testUserPerceivedCharCount();
    void testGraphicAnchorDeletion();
    CPPUNIT_TEST_SUITE(SwDocTest);
    CPPUNIT_TEST(randomTest);
    CPPUNIT_TEST(testPageDescName);
    CPPUNIT_TEST(testFileNameFields);
    CPPUNIT_TEST(testDocStat);
    CPPUNIT_TEST(testSwScanner);
    CPPUNIT_TEST(testUserPerceivedCharCount);
    CPPUNIT_TEST(testGraphicAnchorDeletion);
    CPPUNIT_TEST_SUITE_END();
@@ -189,6 +191,28 @@ void SwDocTest::testDocStat()
    CPPUNIT_ASSERT_MESSAGE("And cache is updated too", m_pDoc->GetDocStat().nChar == nLen);
 }
 //For UI character counts we should follow UAX#29 and display the user
 //perceived characters, not the number of codepoints, nor the number of code
 //units http://unicode.org/reports/tr29/
 void SwDocTest::testUserPerceivedCharCount()
 {
    SwBreakIt *pBreakIter = SwBreakIt::Get();
    //Grapheme example, two different unicode code-points (U+05D0 followed by
    //U+05B8) perceived by the user as a single character
    const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
    ::rtl::OUString sALEF_QAMATS(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
    sal_Int32 nGraphemeCount = pBreakIter->getGraphemeCount(sALEF_QAMATS);
    CPPUNIT_ASSERT_MESSAGE("Grapheme Count should be 1", nGraphemeCount == 1);
    //Surrogate pair example, one single unicode code-point (U+1D11E)
    //represented as two code units (0xD834, 0xDD1E) in UTF-16
    const sal_Unicode GCLEF[] = { 0xD834, 0xDD1E };
    ::rtl::OUString sGCLEF(GCLEF, SAL_N_ELEMENTS(GCLEF));
    sal_Int32 nCount = pBreakIter->getGraphemeCount(sGCLEF);
    CPPUNIT_ASSERT_MESSAGE("Surrogate Pair should be counted as single character", nCount == 1);
 }
 //See https://bugs.freedesktop.org/show_bug.cgi?id=40449 for motivation
 void SwDocTest::testSwScanner()
 {
--- a/sw/source/core/bastyp/breakit.cxx
+++ b/sw/source/core/bastyp/breakit.cxx
@@ -33,6 +33,7 @@
 #include <unicode/uchar.h>
 #include <com/sun/star/lang/XMultiServiceFactory.hpp>
 #include <com/sun/star/i18n/ScriptType.hdl>
 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
 #include <unotools/localedatawrapper.hxx>
 #include <editeng/unolingu.hxx>
@@ -169,4 +170,20 @@ sal_uInt16 SwBreakIt::GetAllScriptsOfText( const String& rTxt ) const
    return nRet;
 }
 //Count the user-perceived characters in rText: starting at offset 0, ask the
 //break iterator to step over one cell at a time (SKIPCELL mode) and tally one
 //grapheme per step until the returned position reaches the end of the string.
 //An empty string never enters the loop and therefore yields 0.
 sal_Int32 SwBreakIt::getGraphemeCount(const rtl::OUString& rText) const
 {
    sal_Int32 nGraphemes = 0;
    for (sal_Int32 nPos = 0; nPos < rText.getLength(); ++nGraphemes)
    {
        //Request a single step; the same variable also receives the number
        //of steps actually travelled, which is not used further here.
        sal_Int32 nStep = 1;
        nPos = xBreak->nextCharacters(rText, nPos, lang::Locale(),
            i18n::CharacterIteratorMode::SKIPCELL, nStep, nStep);
    }
    return nGraphemes;
 }
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
--- a/sw/source/core/txtnode/txtedt.cxx
+++ b/sw/source/core/txtnode/txtedt.cxx
@@ -1862,12 +1862,13 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
            if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() ))
            {
                ++nTmpWords;
-                nTmpCharsExcludingSpaces += aScanner.GetLen();
+                nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord());
            }
        }
    }
-    nTmpChars = nExpandEnd - nExpandBegin - nNumOfMaskedChars;
+    nTmpChars = pBreakIt->getGraphemeCount(aExpandText.copy(nExpandBegin, nExpandEnd - nExpandBegin));
    nTmpChars -= nNumOfMaskedChars;
    // no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars
    // nor for mid-word selection - set scanner bClip = true at creation
@@ -1889,10 +1890,10 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
            while ( aScanner.NextWord() )
            {
                ++nTmpWords;
-                nTmpCharsExcludingSpaces += aScanner.GetLen();
+                nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord());
            }
-            nTmpChars += nNumStringLen;
+            // Add the numbering string's graphemes to the paragraph count set above;
+            // plain assignment would discard it (the replaced line used += as well).
+            nTmpChars += pBreakIt->getGraphemeCount(aNumString);
        }
        else if ( HasBullet() )
        {