diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index ab5936aa0934..dfe342490c3d 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -151,6 +151,24 @@ void TestBreakIterator::testGraphemeIteration() i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0); } + + { + const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 }; + ::rtl::OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS)); + + sal_Int32 nGraphemeCount = 0; + + sal_Int32 nCurPos = 0; + while (nCurPos < aText.getLength()) + { + sal_Int32 nCount2 = 1; + nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(), + i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2); + ++nGraphemeCount; + } + + CPPUNIT_ASSERT_MESSAGE("Should be considered 1 grapheme", nGraphemeCount == 1); + } } //A test to ensure that certain ranges and codepoints that are categorized as diff --git a/sw/inc/breakit.hxx b/sw/inc/breakit.hxx index ae05468aa16b..3075fc9dfe63 100644 --- a/sw/inc/breakit.hxx +++ b/sw/inc/breakit.hxx @@ -110,7 +110,9 @@ public: } sal_uInt16 GetRealScriptOfText( const String& rTxt, xub_StrLen nPos ) const; - sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const; + sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const; + + sal_Int32 getGraphemeCount(const rtl::OUString& rStr) const; }; #define SW_BREAKITER() SwBreakIt::Get() diff --git a/sw/qa/core/swdoc-test.cxx b/sw/qa/core/swdoc-test.cxx index 3d27fa071d39..d6f145f0d9bf 100644 --- a/sw/qa/core/swdoc-test.cxx +++ b/sw/qa/core/swdoc-test.cxx @@ -41,19 +41,20 @@ #include #include -#include "init.hxx" -#include "swtypes.hxx" -#include "docstat.hxx" +#include "breakit.hxx" #include "doc.hxx" -#include "ndtxt.hxx" #include "docsh.hxx" -#include "shellres.hxx" +#include "docstat.hxx" #include "docufld.hxx" #include "fmtanchr.hxx" -#include "swscanner.hxx" -#include "swcrsr.hxx" 
-#include "swmodule.hxx" +#include "init.hxx" +#include "ndtxt.hxx" #include "shellio.hxx" +#include "shellres.hxx" +#include "swcrsr.hxx" +#include "swscanner.hxx" +#include "swmodule.hxx" +#include "swtypes.hxx" SO2_DECL_REF(SwDocShell) SO2_IMPL_REF(SwDocShell) @@ -73,14 +74,15 @@ public: void testFileNameFields(); void testDocStat(); void testSwScanner(); + void testUserPerceivedCharCount(); void testGraphicAnchorDeletion(); CPPUNIT_TEST_SUITE(SwDocTest); CPPUNIT_TEST(randomTest); CPPUNIT_TEST(testPageDescName); CPPUNIT_TEST(testFileNameFields); - CPPUNIT_TEST(testDocStat); CPPUNIT_TEST(testSwScanner); + CPPUNIT_TEST(testUserPerceivedCharCount); CPPUNIT_TEST(testGraphicAnchorDeletion); CPPUNIT_TEST_SUITE_END(); @@ -189,6 +191,28 @@ void SwDocTest::testDocStat() CPPUNIT_ASSERT_MESSAGE("And cache is updated too", m_pDoc->GetDocStat().nChar == nLen); } +//For UI character counts we should follow UAX#29 and display the user +//perceived characters, not the number of codepoints, nor the number of code +//units http://unicode.org/reports/tr29/ +void SwDocTest::testUserPerceivedCharCount() +{ + SwBreakIt *pBreakIter = SwBreakIt::Get(); + + //Grapheme example, two different unicode code-points perceived by the user as a single + //glyph + const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 }; + ::rtl::OUString sALEF_QAMATS(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS)); + sal_Int32 nGraphemeCount = pBreakIter->getGraphemeCount(sALEF_QAMATS); + CPPUNIT_ASSERT_MESSAGE("Grapheme Count should be 1", nGraphemeCount == 1); + + //Surrogate pair example, one single unicode code-point (U+1D11E) + //represented as two code units in UTF-16 + const sal_Unicode GCLEF[] = { 0xD834, 0xDD1E }; + ::rtl::OUString sGCLEF(GCLEF, SAL_N_ELEMENTS(GCLEF)); + sal_Int32 nCount = pBreakIter->getGraphemeCount(sGCLEF); + CPPUNIT_ASSERT_MESSAGE("Surrogate Pair should be counted as single character", nCount == 1); +} + //See https://bugs.freedesktop.org/show_bug.cgi?id=40449 for motivation void 
SwDocTest::testSwScanner() { diff --git a/sw/source/core/bastyp/breakit.cxx b/sw/source/core/bastyp/breakit.cxx index 69a35b7f9f4b..2435aae33180 100644 --- a/sw/source/core/bastyp/breakit.cxx +++ b/sw/source/core/bastyp/breakit.cxx @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -169,4 +170,20 @@ sal_uInt16 SwBreakIt::GetAllScriptsOfText( const String& rTxt ) const return nRet; } +sal_Int32 SwBreakIt::getGraphemeCount(const rtl::OUString& rText) const +{ + sal_Int32 nGraphemeCount = 0; + + sal_Int32 nCurPos = 0; + while (nCurPos < rText.getLength()) + { + sal_Int32 nCount2 = 1; + nCurPos = xBreak->nextCharacters(rText, nCurPos, lang::Locale(), + i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2); + ++nGraphemeCount; + } + + return nGraphemeCount; +} + /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sw/source/core/txtnode/txtedt.cxx b/sw/source/core/txtnode/txtedt.cxx index 0c70225b8eee..8fe6051f4fb8 100644 --- a/sw/source/core/txtnode/txtedt.cxx +++ b/sw/source/core/txtnode/txtedt.cxx @@ -1862,12 +1862,13 @@ void SwTxtNode::CountWords( SwDocStat& rStat, if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() )) { ++nTmpWords; - nTmpCharsExcludingSpaces += aScanner.GetLen(); + nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord()); } } } - nTmpChars = nExpandEnd - nExpandBegin - nNumOfMaskedChars; + nTmpChars = pBreakIt->getGraphemeCount(aExpandText.copy(nExpandBegin, nExpandEnd - nExpandBegin)); + nTmpChars -= nNumOfMaskedChars; // no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars // nor for mid-word selection - set scanner bClip = true at creation @@ -1889,10 +1890,10 @@ void SwTxtNode::CountWords( SwDocStat& rStat, while ( aScanner.NextWord() ) { ++nTmpWords; - nTmpCharsExcludingSpaces += aScanner.GetLen(); + nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord()); } - nTmpChars += nNumStringLen; + nTmpChars += 
pBreakIt->getGraphemeCount(aNumString); } else if ( HasBullet() ) {