INTEGRATION: CWS swqcore06 (1.6.26); FILE MERGED

2005/02/04 17:52:41 dvo 1.6.26.1: #i39255# fix byte order mark (BOM) recognition; add UTF-8 BOM
This commit is contained in:
Vladimir Glazounov
2005-02-22 09:06:12 +00:00
parent f75a527bea
commit 96725e8ceb

View File

@@ -2,9 +2,9 @@
* *
* $RCSfile: xml2utf.cxx,v $ * $RCSfile: xml2utf.cxx,v $
* *
* $Revision: 1.6 $ * $Revision: 1.7 $
* *
* last change: $Author: hr $ $Date: 2004-02-04 13:40:37 $ * last change: $Author: vg $ $Date: 2005-02-22 10:06:12 $
* *
* The Contents of this file are made available subject to the terms of * The Contents of this file are made available subject to the terms of
* either of the following licenses * either of the following licenses
@@ -257,7 +257,7 @@ sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8
sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq ) sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
{ {
const sal_Int8 *pSource = seq.getConstArray(); const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
sal_Bool bReturn = sal_True; sal_Bool bReturn = sal_True;
if( seq.getLength() < 4 ) { if( seq.getLength() < 4 ) {
@@ -299,14 +299,14 @@ sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
} }
} }
} }
else if( 0xFE == static_cast<unsigned char> (pSource[0]) && else if( 0xFE == pSource[0] &&
0xFF == static_cast<unsigned char> (pSource[1]) ) { 0xFF == pSource[1] ) {
// UTF-16 big endian // UTF-16 big endian
// conversion is done so that encoding information can be easily extracted // conversion is done so that encoding information can be easily extracted
m_sEncoding = "utf-16"; m_sEncoding = "utf-16";
} }
else if( 0xFF == static_cast<unsigned char> (pSource[0]) && else if( 0xFF == pSource[0] &&
0xFE == static_cast<unsigned char> (pSource[1]) ) { 0xFE == pSource[1] ) {
// UTF-16 little endian // UTF-16 little endian
// conversion is done so that encoding information can be easily extracted // conversion is done so that encoding information can be easily extracted
m_sEncoding = "utf-16"; m_sEncoding = "utf-16";
@@ -334,6 +334,16 @@ sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
m_sEncoding = "utf-16"; m_sEncoding = "utf-16";
} }
else if( 0xEF == pSource[0] &&
0xBB == pSource[1] &&
0xBF == pSource[2] )
{
// UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
// The BOM is removed.
memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
seq.realloc( seq.getLength() - 3 );
m_sEncoding = "utf-8";
}
else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) { else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
// UCS-4 big endian // UCS-4 big endian
m_sEncoding = "ucs-4"; m_sEncoding = "ucs-4";