simpleExtract() with variants

rsc needs to resolve all known defined languages without access to
liblangtag, because liblangtag would need
/usr/local/share/liblangtag/language-subtag-registry.xml, so we'd end up
with a bootstrap problem (or would have to pass the database path).

Change-Id: I6b966d45080da26cb89169cdb40cd8a58c04a276
This commit is contained in:
Eike Rathke
2013-09-02 18:01:13 +02:00
parent 8614ff2993
commit 49656398d4
2 changed files with 131 additions and 37 deletions

View File

@@ -155,6 +155,7 @@ LanguageTag::LanguageTag( const OUString & rBcp47LanguageTag, bool bCanonicalize
mbCachedLanguage( false), mbCachedLanguage( false),
mbCachedScript( false), mbCachedScript( false),
mbCachedCountry( false), mbCachedCountry( false),
mbCachedVariants( false),
mbIsFallback( false) mbIsFallback( false)
{ {
if (bCanonicalize) if (bCanonicalize)
@@ -178,6 +179,7 @@ LanguageTag::LanguageTag( const com::sun::star::lang::Locale & rLocale )
mbCachedLanguage( false), mbCachedLanguage( false),
mbCachedScript( false), mbCachedScript( false),
mbCachedCountry( false), mbCachedCountry( false),
mbCachedVariants( false),
mbIsFallback( false) mbIsFallback( false)
{ {
} }
@@ -198,6 +200,7 @@ LanguageTag::LanguageTag( LanguageType nLanguage )
mbCachedLanguage( false), mbCachedLanguage( false),
mbCachedScript( false), mbCachedScript( false),
mbCachedCountry( false), mbCachedCountry( false),
mbCachedVariants( false),
mbIsFallback( false) mbIsFallback( false)
{ {
} }
@@ -220,6 +223,7 @@ LanguageTag::LanguageTag( const OUString& rBcp47, const OUString& rLanguage,
mbCachedLanguage( false), mbCachedLanguage( false),
mbCachedScript( false), mbCachedScript( false),
mbCachedCountry( false), mbCachedCountry( false),
mbCachedVariants( false),
mbIsFallback( false) mbIsFallback( false)
{ {
if (!mbSystemLocale && !mbInitializedBcp47) if (!mbSystemLocale && !mbInitializedBcp47)
@@ -257,6 +261,7 @@ LanguageTag::LanguageTag( const rtl_Locale & rLocale )
mbCachedLanguage( false), mbCachedLanguage( false),
mbCachedScript( false), mbCachedScript( false),
mbCachedCountry( false), mbCachedCountry( false),
mbCachedVariants( false),
mbIsFallback( false) mbIsFallback( false)
{ {
convertFromRtlLocale(); convertFromRtlLocale();
@@ -284,6 +289,7 @@ LanguageTag::LanguageTag( const LanguageTag & rLanguageTag )
mbCachedLanguage( rLanguageTag.mbCachedLanguage), mbCachedLanguage( rLanguageTag.mbCachedLanguage),
mbCachedScript( rLanguageTag.mbCachedScript), mbCachedScript( rLanguageTag.mbCachedScript),
mbCachedCountry( rLanguageTag.mbCachedCountry), mbCachedCountry( rLanguageTag.mbCachedCountry),
mbCachedVariants( rLanguageTag.mbCachedVariants),
mbIsFallback( rLanguageTag.mbIsFallback) mbIsFallback( rLanguageTag.mbIsFallback)
{ {
if (mpImplLangtag) if (mpImplLangtag)
@@ -313,6 +319,7 @@ LanguageTag& LanguageTag::operator=( const LanguageTag & rLanguageTag )
mbCachedLanguage = rLanguageTag.mbCachedLanguage; mbCachedLanguage = rLanguageTag.mbCachedLanguage;
mbCachedScript = rLanguageTag.mbCachedScript; mbCachedScript = rLanguageTag.mbCachedScript;
mbCachedCountry = rLanguageTag.mbCachedCountry; mbCachedCountry = rLanguageTag.mbCachedCountry;
mbCachedVariants = rLanguageTag.mbCachedVariants;
mbIsFallback = rLanguageTag.mbIsFallback; mbIsFallback = rLanguageTag.mbIsFallback;
if (mpImplLangtag) if (mpImplLangtag)
theDataRef::get().incRef(); theDataRef::get().incRef();
@@ -360,6 +367,7 @@ void LanguageTag::resetVars()
mbCachedLanguage = false; mbCachedLanguage = false;
mbCachedScript = false; mbCachedScript = false;
mbCachedCountry = false; mbCachedCountry = false;
mbCachedVariants = false;
mbIsFallback = false; mbIsFallback = false;
} }
@@ -441,20 +449,22 @@ bool LanguageTag::canonicalize()
// and want to determine if parsing it would be possible // and want to determine if parsing it would be possible
// without using liblangtag just to see if it is a simple known // without using liblangtag just to see if it is a simple known
// locale. // locale.
OUString aLanguage, aScript, aCountry; OUString aLanguage, aScript, aCountry, aVariants;
Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry); Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants);
if (eExt != EXTRACTED_NONE) if (eExt != EXTRACTED_NONE)
{ {
if (eExt == EXTRACTED_LSC) if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV)
{ {
// Rebuild bcp47 with proper casing of tags. // Rebuild bcp47 with proper casing of tags.
OUStringBuffer aBuf( aLanguage.getLength() + 1 + aScript.getLength() + OUStringBuffer aBuf( aLanguage.getLength() + 1 + aScript.getLength() +
1 + aCountry.getLength()); 1 + aCountry.getLength() + 1 + aVariants.getLength());
aBuf.append( aLanguage); aBuf.append( aLanguage);
if (!aScript.isEmpty()) if (!aScript.isEmpty())
aBuf.append("-" + aScript); aBuf.append("-" + aScript);
if (!aCountry.isEmpty()) if (!aCountry.isEmpty())
aBuf.append("-" + aCountry); aBuf.append("-" + aCountry);
if (!aVariants.isEmpty())
aBuf.append("-" + aVariants);
OUString aStr( aBuf.makeStringAndClear()); OUString aStr( aBuf.makeStringAndClear());
if (maBcp47 != aStr) if (maBcp47 != aStr)
@@ -749,7 +759,7 @@ OUString LanguageTag::getLanguageFromLangtag()
} }
else else
{ {
if (mbCachedLanguage || cacheSimpleLSC()) if (mbCachedLanguage || cacheSimpleLSCV())
aLanguage = maCachedLanguage; aLanguage = maCachedLanguage;
} }
return aLanguage; return aLanguage;
@@ -775,7 +785,7 @@ OUString LanguageTag::getScriptFromLangtag()
} }
else else
{ {
if (mbCachedScript || cacheSimpleLSC()) if (mbCachedScript || cacheSimpleLSCV())
aScript = maCachedScript; aScript = maCachedScript;
} }
return aScript; return aScript;
@@ -808,7 +818,7 @@ OUString LanguageTag::getRegionFromLangtag()
} }
else else
{ {
if (mbCachedCountry || cacheSimpleLSC()) if (mbCachedCountry || cacheSimpleLSCV())
aRegion = maCachedCountry; aRegion = maCachedCountry;
} }
return aRegion; return aRegion;
@@ -840,6 +850,11 @@ OUString LanguageTag::getVariantsFromLangtag()
} }
} }
} }
else
{
if (mbCachedVariants || cacheSimpleLSCV())
aVariants = maCachedVariants;
}
return aVariants; return aVariants;
} }
@@ -1020,7 +1035,12 @@ OUString LanguageTag::getRegion() const
OUString LanguageTag::getVariants() const OUString LanguageTag::getVariants() const
{ {
return const_cast<LanguageTag*>(this)->getVariantsFromLangtag(); if (!mbCachedVariants)
{
maCachedVariants = const_cast<LanguageTag*>(this)->getVariantsFromLangtag();
mbCachedVariants = true;
}
return maCachedVariants;
} }
@@ -1055,16 +1075,18 @@ bool LanguageTag::hasScript() const
} }
bool LanguageTag::cacheSimpleLSC() bool LanguageTag::cacheSimpleLSCV()
{ {
OUString aLanguage, aScript, aCountry; OUString aLanguage, aScript, aCountry, aVariants;
bool bRet = (simpleExtract( maBcp47, aLanguage, aScript, aCountry) == EXTRACTED_LSC); Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants);
bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV);
if (bRet) if (bRet)
{ {
maCachedLanguage = aLanguage; maCachedLanguage = aLanguage;
maCachedScript = aScript; maCachedScript = aScript;
maCachedCountry = aCountry; maCachedCountry = aCountry;
mbCachedLanguage = mbCachedScript = mbCachedCountry = true; maCachedVariants = aVariants;
mbCachedLanguage = mbCachedScript = mbCachedCountry = mbCachedVariants = true;
} }
return bRet; return bRet;
} }
@@ -1275,11 +1297,14 @@ bool LanguageTag::operator!=( const LanguageTag & rLanguageTag ) const
// static // static
LanguageTag::Extraction LanguageTag::simpleExtract( const OUString& rBcp47, LanguageTag::Extraction LanguageTag::simpleExtract( const OUString& rBcp47,
OUString& rLanguage, OUString& rScript, OUString& rCountry ) OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rVariants )
{ {
Extraction eRet = EXTRACTED_NONE; Extraction eRet = EXTRACTED_NONE;
const sal_Int32 nLen = rBcp47.getLength(); const sal_Int32 nLen = rBcp47.getLength();
const sal_Int32 nHyph1 = rBcp47.indexOf( '-'); const sal_Int32 nHyph1 = rBcp47.indexOf( '-');
const sal_Int32 nHyph2 = (nHyph1 < 0 ? -1 : rBcp47.indexOf( '-', nHyph1 + 1));
const sal_Int32 nHyph3 = (nHyph2 < 0 ? -1 : rBcp47.indexOf( '-', nHyph2 + 1));
const sal_Int32 nHyph4 = (nHyph3 < 0 ? -1 : rBcp47.indexOf( '-', nHyph3 + 1));
if (nLen == 1 && rBcp47[0] == '*') // * the dreaded jolly joker if (nLen == 1 && rBcp47[0] == '*') // * the dreaded jolly joker
{ {
// It's f*d up but we need to recognize this. // It's f*d up but we need to recognize this.
@@ -1290,34 +1315,96 @@ LanguageTag::Extraction LanguageTag::simpleExtract( const OUString& rBcp47,
// x-... privateuse tags MUST be known to us by definition. // x-... privateuse tags MUST be known to us by definition.
eRet = EXTRACTED_X; eRet = EXTRACTED_X;
} }
else if ((nLen == 2 || nLen == 3) && nHyph1 < 0) // ll or lll else if (nLen == 2 || nLen == 3) // ll or lll
{ {
rLanguage = rBcp47.toAsciiLowerCase(); if (nHyph1 < 0)
rScript = rCountry = OUString(); {
eRet = EXTRACTED_LSC; rLanguage = rBcp47.toAsciiLowerCase();
rScript = rCountry = rVariants = OUString();
eRet = EXTRACTED_LSC;
}
} }
else if ( (nLen == 5 && nHyph1 == 2) // ll-CC else if ( (nHyph1 == 2 && nLen == 5) // ll-CC
|| (nLen == 6 && nHyph1 == 3)) // lll-CC || (nHyph1 == 3 && nLen == 6)) // lll-CC
{ {
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); if (nHyph2 < 0)
rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); {
rScript = OUString(); rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
eRet = EXTRACTED_LSC; rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase();
rScript = rVariants = OUString();
eRet = EXTRACTED_LSC;
}
} }
else if ( (nHyph1 == 2 && nLen == 10) // ll-Ssss-CC check else if ( (nHyph1 == 2 && nLen == 7) // ll-Ssss
|| (nHyph1 == 3 && nLen == 11)) // lll-Ssss-CC check || (nHyph1 == 3 && nLen == 8)) // lll-Ssss
{ {
const sal_Int32 nHyph2 = rBcp47.indexOf( '-', nHyph1 + 1); /* TODO: also accept a (DIGIT 3*ALNUM) vvvv variant instead of Ssss */
if (nHyph2 == nHyph1 + 5) if (nHyph2 < 0)
{
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
rCountry = rVariants = OUString();
eRet = EXTRACTED_LSC;
}
}
else if ( (nHyph1 == 2 && nHyph2 == 7 && nLen == 10) // ll-Ssss-CC
|| (nHyph1 == 3 && nHyph2 == 8 && nLen == 11)) // lll-Ssss-CC
{
if (nHyph3 < 0)
{ {
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase(); rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase();
rVariants = OUString();
eRet = EXTRACTED_LSC; eRet = EXTRACTED_LSC;
} }
} }
else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 10 && nLen >= 15) // ll-Ssss-CC-vvvv[vvvv][-...]
|| (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 11 && nLen >= 16)) // lll-Ssss-CC-vvvv[vvvv][-...]
{
if (nHyph4 < 0 || (nHyph4 - nHyph3 > 4 && nHyph4 - nHyph3 <= 9))
{
rVariants = rBcp47.copy( nHyph3 + 1);
if (nHyph4 < 0 && (rVariants.getLength() < 4 || 8 < rVariants.getLength()))
{
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase();
eRet = EXTRACTED_LV;
}
}
}
else if ( (nHyph1 == 2 && nHyph2 == 5 && nLen >= 10) // ll-CC-vvvv[vvvv][-...]
|| (nHyph1 == 3 && nHyph2 == 6 && nLen >= 11)) // lll-CC-vvvv[vvvv][-...]
{
if (nHyph3 < 0 || (nHyph3 - nHyph2 > 4 && nHyph3 - nHyph2 <= 9))
{
rVariants = rBcp47.copy( nHyph2 + 1);
if (nHyph3 < 0 && (rVariants.getLength() < 4 || 8 < rVariants.getLength()))
{
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase();
rScript = OUString();
eRet = EXTRACTED_LV;
}
}
}
else if ( (nHyph1 == 2 && nLen >= 8) // ll-vvvvv[vvv][-...]
|| (nHyph1 == 3 && nLen >= 9)) // lll-vvvvv[vvv][-...]
{
if (nHyph2 < 0 || (nHyph2 - nHyph1 > 5 && nHyph2 - nHyph1 <= 9))
{
rVariants = rBcp47.copy( nHyph1 + 1);
if (nHyph2 < 0 && (rVariants.getLength() < 5 || 8 < rVariants.getLength()))
{
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript = rCountry = OUString();
eRet = EXTRACTED_LV;
}
}
}
if (eRet == EXTRACTED_NONE) if (eRet == EXTRACTED_NONE)
rLanguage = rScript = rCountry = OUString(); rLanguage = rScript = rCountry = rVariants = OUString();
return eRet; return eRet;
} }

View File

@@ -449,6 +449,7 @@ private:
mutable OUString maCachedLanguage; ///< cache getLanguage() mutable OUString maCachedLanguage; ///< cache getLanguage()
mutable OUString maCachedScript; ///< cache getScript() mutable OUString maCachedScript; ///< cache getScript()
mutable OUString maCachedCountry; ///< cache getCountry() mutable OUString maCachedCountry; ///< cache getCountry()
mutable OUString maCachedVariants; ///< cache getVariants()
mutable void* mpImplLangtag; ///< actually lt_tag_t pointer, encapsulated mutable void* mpImplLangtag; ///< actually lt_tag_t pointer, encapsulated
mutable LanguageType mnLangID; mutable LanguageType mnLangID;
mutable Decision meIsValid; mutable Decision meIsValid;
@@ -462,6 +463,7 @@ private:
mutable bool mbCachedLanguage : 1; mutable bool mbCachedLanguage : 1;
mutable bool mbCachedScript : 1; mutable bool mbCachedScript : 1;
mutable bool mbCachedCountry : 1; mutable bool mbCachedCountry : 1;
mutable bool mbCachedVariants : 1;
bool mbIsFallback : 1; bool mbIsFallback : 1;
void convertLocaleToBcp47(); void convertLocaleToBcp47();
@@ -489,12 +491,12 @@ private:
void resetVars(); void resetVars();
/** Obtain Language, Script and Country via simpleExtract() and assign them /** Obtain Language, Script, Country and Variants via simpleExtract() and
to the cached variables if successful. assign them to the cached variables if successful.
@return return of simpleExtract() @return return of simpleExtract()
*/ */
bool cacheSimpleLSC(); bool cacheSimpleLSCV();
static bool isIsoLanguage( const OUString& rLanguage ); static bool isIsoLanguage( const OUString& rLanguage );
static bool isIsoScript( const OUString& rScript ); static bool isIsoScript( const OUString& rScript );
@@ -504,23 +506,28 @@ private:
{ {
EXTRACTED_NONE, EXTRACTED_NONE,
EXTRACTED_LSC, EXTRACTED_LSC,
EXTRACTED_LV,
EXTRACTED_X, EXTRACTED_X,
EXTRACTED_X_JOKER EXTRACTED_X_JOKER
}; };
/** Of a simple language tag of the form lll[-Ssss][-CC] (i.e. one that /** Of a language tag of the form lll[-Ssss][-CC][-vvvvvvvv] extract the
would fulfill the isIsoODF() condition) extract the portions. portions.
Does not check case or content! Does not check case or content!
@return EXTRACTED_LSC if simple tag was detected, EXTRACTED_X if x-... @return EXTRACTED_LSC if simple tag was detected (i.e. one that
privateuse tag was detected, EXTRACTED_X_JOKER if "*" joker was would fulfill the isIsoODF() condition),
detected, else EXTRACTED_NONE. EXTRACTED_LV if a tag with variant was detected,
EXTRACTED_X if x-... privateuse tag was detected,
EXTRACTED_X_JOKER if "*" joker was detected,
EXTRACTED_NONE else.
*/ */
static Extraction simpleExtract( const OUString& rBcp47, static Extraction simpleExtract( const OUString& rBcp47,
OUString& rLanguage, OUString& rLanguage,
OUString& rScript, OUString& rScript,
OUString& rCountry ); OUString& rCountry,
OUString& rVariants );
}; };