Make BOM detection slightly more straightforward
Detect the BOM without taking the system endianness or the current stream endianness into account: just read and check single bytes.

Change-Id: I9273d8f403caad7adb5e11cecc04e326919dad1f
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/126595
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
This commit is contained in:
@@ -718,52 +718,54 @@ void SvStream::StartReadingUnicodeText( rtl_TextEncoding eReadBomCharSet )
|
|||||||
eReadBomCharSet == RTL_TEXTENCODING_UTF8))
|
eReadBomCharSet == RTL_TEXTENCODING_UTF8))
|
||||||
return; // nothing to read
|
return; // nothing to read
|
||||||
|
|
||||||
bool bTryUtf8 = false;
|
const sal_uInt64 nOldPos = Tell();
|
||||||
sal_uInt16 nFlag(0);
|
bool bGetBack = true;
|
||||||
sal_sSize nBack = sizeof(nFlag);
|
unsigned char nFlag(0);
|
||||||
ReadUInt16( nFlag );
|
ReadUChar( nFlag );
|
||||||
switch ( nFlag )
|
switch ( nFlag )
|
||||||
{
|
{
|
||||||
case 0xfeff :
|
case 0xfe: // UTF-16BE?
|
||||||
// native UTF-16
|
|
||||||
if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
|
|
||||||
eReadBomCharSet == RTL_TEXTENCODING_UNICODE)
|
|
||||||
nBack = 0;
|
|
||||||
break;
|
|
||||||
case 0xfffe :
|
|
||||||
// swapped UTF-16
|
|
||||||
if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
|
if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
|
||||||
eReadBomCharSet == RTL_TEXTENCODING_UNICODE)
|
eReadBomCharSet == RTL_TEXTENCODING_UNICODE)
|
||||||
{
|
{
|
||||||
SetEndian( m_nEndian == SvStreamEndian::BIG ? SvStreamEndian::LITTLE : SvStreamEndian::BIG );
|
ReadUChar(nFlag);
|
||||||
nBack = 0;
|
if (nFlag == 0xff)
|
||||||
|
{
|
||||||
|
SetEndian(SvStreamEndian::BIG);
|
||||||
|
bGetBack = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 0xefbb :
|
case 0xff: // UTF-16LE?
|
||||||
if (m_nEndian == SvStreamEndian::BIG &&
|
if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
|
||||||
(eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
|
eReadBomCharSet == RTL_TEXTENCODING_UNICODE)
|
||||||
eReadBomCharSet == RTL_TEXTENCODING_UTF8))
|
{
|
||||||
bTryUtf8 = true;
|
ReadUChar(nFlag);
|
||||||
|
if (nFlag == 0xfe)
|
||||||
|
{
|
||||||
|
SetEndian(SvStreamEndian::LITTLE);
|
||||||
|
bGetBack = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 0xbbef :
|
case 0xef: // UTF-8?
|
||||||
if (m_nEndian == SvStreamEndian::LITTLE &&
|
if ( eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
|
||||||
(eReadBomCharSet == RTL_TEXTENCODING_DONTKNOW ||
|
eReadBomCharSet == RTL_TEXTENCODING_UTF8)
|
||||||
eReadBomCharSet == RTL_TEXTENCODING_UTF8))
|
{
|
||||||
bTryUtf8 = true;
|
ReadUChar(nFlag);
|
||||||
|
if (nFlag == 0xbb)
|
||||||
|
{
|
||||||
|
ReadUChar(nFlag);
|
||||||
|
if (nFlag == 0xbf)
|
||||||
|
bGetBack = false; // it is UTF-8
|
||||||
|
}
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
; // nothing
|
; // nothing
|
||||||
}
|
}
|
||||||
if (bTryUtf8)
|
if (bGetBack)
|
||||||
{
|
Seek(nOldPos); // no BOM, pure data
|
||||||
unsigned char nChar(0);
|
|
||||||
nBack += sizeof(nChar);
|
|
||||||
ReadUChar( nChar );
|
|
||||||
if (nChar == 0xbf)
|
|
||||||
nBack = 0; // it is UTF-8
|
|
||||||
}
|
|
||||||
if (nBack)
|
|
||||||
SeekRel( -nBack ); // no BOM, pure data
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sal_uInt64 SvStream::SeekRel(sal_Int64 const nPos)
|
sal_uInt64 SvStream::SeekRel(sal_Int64 const nPos)
|
||||||
|
Reference in New Issue
Block a user