Follow UAX-29 and present user-perceived character counts
Report neither the number of code units nor the number of code points; instead follow UAX #29 (http://unicode.org/reports/tr29/) and present the grapheme count. Add a few representative tests to try to avoid gotchas.
This commit is contained in:
@@ -151,6 +151,24 @@ void TestBreakIterator::testGraphemeIteration()
|
|||||||
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
||||||
CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
|
CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
|
||||||
|
::rtl::OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
|
||||||
|
|
||||||
|
sal_Int32 nGraphemeCount = 0;
|
||||||
|
|
||||||
|
sal_Int32 nCurPos = 0;
|
||||||
|
while (nCurPos < aText.getLength())
|
||||||
|
{
|
||||||
|
sal_Int32 nCount2 = 1;
|
||||||
|
nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
|
||||||
|
i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
|
||||||
|
++nGraphemeCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
CPPUNIT_ASSERT_MESSAGE("Should be considered 1 grapheme", nGraphemeCount == 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//A test to ensure that certain ranges and codepoints that are categorized as
|
//A test to ensure that certain ranges and codepoints that are categorized as
|
||||||
|
@@ -111,6 +111,8 @@ public:
|
|||||||
|
|
||||||
sal_uInt16 GetRealScriptOfText( const String& rTxt, xub_StrLen nPos ) const;
|
sal_uInt16 GetRealScriptOfText( const String& rTxt, xub_StrLen nPos ) const;
|
||||||
sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const;
|
sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const;
|
||||||
|
|
||||||
|
//Returns the number of user-perceived characters (graphemes, following
//UAX#29 http://unicode.org/reports/tr29/) in rStr, as opposed to the number
//of code units or code points
sal_Int32 getGraphemeCount(const rtl::OUString& rStr) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define SW_BREAKITER() SwBreakIt::Get()
|
#define SW_BREAKITER() SwBreakIt::Get()
|
||||||
|
@@ -41,19 +41,20 @@
|
|||||||
#include <sfx2/docfile.hxx>
|
#include <sfx2/docfile.hxx>
|
||||||
#include <sfx2/sfxmodelfactory.hxx>
|
#include <sfx2/sfxmodelfactory.hxx>
|
||||||
|
|
||||||
#include "init.hxx"
|
#include "breakit.hxx"
|
||||||
#include "swtypes.hxx"
|
|
||||||
#include "docstat.hxx"
|
|
||||||
#include "doc.hxx"
|
#include "doc.hxx"
|
||||||
#include "ndtxt.hxx"
|
|
||||||
#include "docsh.hxx"
|
#include "docsh.hxx"
|
||||||
#include "shellres.hxx"
|
#include "docstat.hxx"
|
||||||
#include "docufld.hxx"
|
#include "docufld.hxx"
|
||||||
#include "fmtanchr.hxx"
|
#include "fmtanchr.hxx"
|
||||||
#include "swscanner.hxx"
|
#include "init.hxx"
|
||||||
#include "swcrsr.hxx"
|
#include "ndtxt.hxx"
|
||||||
#include "swmodule.hxx"
|
|
||||||
#include "shellio.hxx"
|
#include "shellio.hxx"
|
||||||
|
#include "shellres.hxx"
|
||||||
|
#include "swcrsr.hxx"
|
||||||
|
#include "swscanner.hxx"
|
||||||
|
#include "swmodule.hxx"
|
||||||
|
#include "swtypes.hxx"
|
||||||
|
|
||||||
SO2_DECL_REF(SwDocShell)
|
SO2_DECL_REF(SwDocShell)
|
||||||
SO2_IMPL_REF(SwDocShell)
|
SO2_IMPL_REF(SwDocShell)
|
||||||
@@ -73,14 +74,15 @@ public:
|
|||||||
void testFileNameFields();
|
void testFileNameFields();
|
||||||
void testDocStat();
|
void testDocStat();
|
||||||
void testSwScanner();
|
void testSwScanner();
|
||||||
|
void testUserPerceivedCharCount();
|
||||||
void testGraphicAnchorDeletion();
|
void testGraphicAnchorDeletion();
|
||||||
|
|
||||||
CPPUNIT_TEST_SUITE(SwDocTest);
|
CPPUNIT_TEST_SUITE(SwDocTest);
|
||||||
CPPUNIT_TEST(randomTest);
|
CPPUNIT_TEST(randomTest);
|
||||||
CPPUNIT_TEST(testPageDescName);
|
CPPUNIT_TEST(testPageDescName);
|
||||||
CPPUNIT_TEST(testFileNameFields);
|
CPPUNIT_TEST(testFileNameFields);
|
||||||
CPPUNIT_TEST(testDocStat);
|
|
||||||
CPPUNIT_TEST(testSwScanner);
|
CPPUNIT_TEST(testSwScanner);
|
||||||
|
CPPUNIT_TEST(testUserPerceivedCharCount);
|
||||||
CPPUNIT_TEST(testGraphicAnchorDeletion);
|
CPPUNIT_TEST(testGraphicAnchorDeletion);
|
||||||
CPPUNIT_TEST_SUITE_END();
|
CPPUNIT_TEST_SUITE_END();
|
||||||
|
|
||||||
@@ -189,6 +191,28 @@ void SwDocTest::testDocStat()
|
|||||||
CPPUNIT_ASSERT_MESSAGE("And cache is updated too", m_pDoc->GetDocStat().nChar == nLen);
|
CPPUNIT_ASSERT_MESSAGE("And cache is updated too", m_pDoc->GetDocStat().nChar == nLen);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//For UI character counts we should follow UAX#29 and display the user
|
||||||
|
//perceived characters, not the number of codepoints, nor the number of code
|
||||||
|
//units http://unicode.org/reports/tr29/
|
||||||
|
//Checks SwBreakIt::getGraphemeCount against two representative inputs that
//each occupy two UTF-16 code units but must be reported as one character
void SwDocTest::testUserPerceivedCharCount()
|
||||||
|
{
|
||||||
|
SwBreakIt *pBreakIter = SwBreakIt::Get();
|
||||||
|

||||||
|
//Grapheme example: two different unicode code-points (a base letter followed by
|
||||||
|
//a combining mark) perceived by the user as a single character
|
||||||
|
const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
|
||||||
|
::rtl::OUString sALEF_QAMATS(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
|
||||||
|
sal_Int32 nGraphemeCount = pBreakIter->getGraphemeCount(sALEF_QAMATS);
|
||||||
|
CPPUNIT_ASSERT_MESSAGE("Grapheme Count should be 1", nGraphemeCount == 1);
|
||||||
|

||||||
|
//Surrogate pair example, one single unicode code-point (U+1D11E)
|
||||||
|
//represented as two code units (0xD834 0xDD1E) in UTF-16
|
||||||
|
const sal_Unicode GCLEF[] = { 0xD834, 0xDD1E };
|
||||||
|
::rtl::OUString sGCLEF(GCLEF, SAL_N_ELEMENTS(GCLEF));
|
||||||
|
sal_Int32 nCount = pBreakIter->getGraphemeCount(sGCLEF);
|
||||||
|
CPPUNIT_ASSERT_MESSAGE("Surrogate Pair should be counted as single character", nCount == 1);
|
||||||
|
}
|
||||||
|
|
||||||
//See https://bugs.freedesktop.org/show_bug.cgi?id=40449 for motivation
|
//See https://bugs.freedesktop.org/show_bug.cgi?id=40449 for motivation
|
||||||
void SwDocTest::testSwScanner()
|
void SwDocTest::testSwScanner()
|
||||||
{
|
{
|
||||||
|
@@ -33,6 +33,7 @@
|
|||||||
#include <unicode/uchar.h>
|
#include <unicode/uchar.h>
|
||||||
#include <com/sun/star/lang/XMultiServiceFactory.hpp>
|
#include <com/sun/star/lang/XMultiServiceFactory.hpp>
|
||||||
#include <com/sun/star/i18n/ScriptType.hdl>
|
#include <com/sun/star/i18n/ScriptType.hdl>
|
||||||
|
#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
|
||||||
#include <unotools/localedatawrapper.hxx>
|
#include <unotools/localedatawrapper.hxx>
|
||||||
|
|
||||||
#include <editeng/unolingu.hxx>
|
#include <editeng/unolingu.hxx>
|
||||||
@@ -169,4 +170,20 @@ sal_uInt16 SwBreakIt::GetAllScriptsOfText( const String& rTxt ) const
|
|||||||
return nRet;
|
return nRet;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Counts the user-perceived characters in rText by walking it from position 0
//with the break iterator in SKIPCELL mode, one step per loop iteration, and
//returning the number of steps taken. An empty string yields 0.
sal_Int32 SwBreakIt::getGraphemeCount(const rtl::OUString& rText) const
|
||||||
|
{
|
||||||
|
sal_Int32 nGraphemeCount = 0;
|
||||||
|

||||||
|
sal_Int32 nCurPos = 0;
|
||||||
|
while (nCurPos < rText.getLength())
|
||||||
|
{
|
||||||
|
//nCount2 is passed both as the number of cells to skip and as the last
|
||||||
|
//argument; presumably the latter is an out-parameter receiving the number
|
||||||
|
//actually skipped - TODO confirm against XBreakIterator::nextCharacters
|
||||||
|
sal_Int32 nCount2 = 1;
|
||||||
|
//NOTE(review): the loop only terminates if the returned position is always
|
||||||
|
//greater than nCurPos - confirm nextCharacters guarantees this
|
||||||
|
nCurPos = xBreak->nextCharacters(rText, nCurPos, lang::Locale(),
|
||||||
|
i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
|
||||||
|
++nGraphemeCount;
|
||||||
|
}
|
||||||
|

||||||
|
return nGraphemeCount;
|
||||||
|
}
|
||||||
|
|
||||||
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
||||||
|
@@ -1862,12 +1862,13 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
|
|||||||
if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() ))
|
if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() ))
|
||||||
{
|
{
|
||||||
++nTmpWords;
|
++nTmpWords;
|
||||||
nTmpCharsExcludingSpaces += aScanner.GetLen();
|
nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
nTmpChars = nExpandEnd - nExpandBegin - nNumOfMaskedChars;
|
nTmpChars = pBreakIt->getGraphemeCount(aExpandText.copy(nExpandBegin, nExpandEnd - nExpandBegin));
|
||||||
|
nTmpChars -= nNumOfMaskedChars;
|
||||||
|
|
||||||
// no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars
|
// no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars
|
||||||
// nor for mid-word selection - set scanner bClip = true at creation
|
// nor for mid-word selection - set scanner bClip = true at creation
|
||||||
@@ -1889,10 +1890,10 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
|
|||||||
while ( aScanner.NextWord() )
|
while ( aScanner.NextWord() )
|
||||||
{
|
{
|
||||||
++nTmpWords;
|
++nTmpWords;
|
||||||
nTmpCharsExcludingSpaces += aScanner.GetLen();
|
nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord());
|
||||||
}
|
}
|
||||||
|
|
||||||
nTmpChars += nNumStringLen;
|
//Add the numbering label's graphemes to the running total; the statement this
//replaces was "nTmpChars += nNumStringLen", and a plain assignment here would
//throw away the character count already accumulated for the paragraph text
nTmpChars += pBreakIt->getGraphemeCount(aNumString);
|
||||||
}
|
}
|
||||||
else if ( HasBullet() )
|
else if ( HasBullet() )
|
||||||
{
|
{
|
||||||
|
Reference in New Issue
Block a user