resolved fdo#48621 better handling of broken CSV files
* Non-escaped (not doubled) quotes in quoted strings are regarded as a broken representation and are taken literally; only a quote followed by a separator ends a field. Unless blanks are separators themselves, trailing blanks between the ending quote and the separator are ignored, complementary to the handling of leading blanks between a separator and a quote.
* Quotes in a non-quoted string are taken literally.
This commit is contained in:
@@ -573,6 +573,77 @@ void ScImportExport::WriteUnicodeOrByteEndl( SvStream& rStrm )
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Classification of a quote character met while scanning CSV data.

    Used as the result of lcl_isFieldEndQuote() and
    lcl_isEscapedOrFieldEndQuote(), and as the running quote state in
    ReadCsvLine().
 */
enum QuoteType
{
    FIELDSTART_QUOTE,   ///< first quote in a field, i.e. the opening quote
    FIRST_QUOTE,        ///< first quote of a doubled (escaped) quote pair
    SECOND_QUOTE,       ///< second quote of a doubled (escaped) quote pair
    FIELDEND_QUOTE,     ///< quote that ends a quoted field; also the state while in unquoted content
    DONTKNOW_QUOTE      ///< unescaped embedded quote that is not regarded as end of field
};
|
||||||
|
|
||||||
|
|
||||||
|
/** Determine if *p is a quote that ends a quoted field.
|
||||||
|
|
||||||
|
Precondition: we are parsing a quoted field already and *p is a quote.
|
||||||
|
|
||||||
|
@return
|
||||||
|
FIELDEND_QUOTE if end of field quote
|
||||||
|
DONTKNOW_QUOTE anything else
|
||||||
|
*/
|
||||||
|
static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps )
|
||||||
|
{
|
||||||
|
// Due to broken CSV generators that don't double embedded quotes check if
|
||||||
|
// a field separator immediately or with trailing spaces follows the quote,
|
||||||
|
// only then end the field, or at end of string.
|
||||||
|
while (p[1] == ' ')
|
||||||
|
++p;
|
||||||
|
if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
|
||||||
|
return FIELDEND_QUOTE;
|
||||||
|
return DONTKNOW_QUOTE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Determine if *p is a quote that is escaped by being doubled or ends a
|
||||||
|
quoted field.
|
||||||
|
|
||||||
|
Precondition: *p is a quote.
|
||||||
|
|
||||||
|
@param nQuotes
|
||||||
|
Quote characters encountered so far.
|
||||||
|
Odd (after opening quote) means either no embedded quotes or only quote
|
||||||
|
pairs so far.
|
||||||
|
Even means either not in a quoted field or already one quote
|
||||||
|
encountered, the first of a pair.
|
||||||
|
|
||||||
|
@return
|
||||||
|
FIELDSTART_QUOTE if first quote in a field, either starting content or
|
||||||
|
embedded so caller should check beforehand.
|
||||||
|
FIRST_QUOTE if first of a doubled quote
|
||||||
|
SECOND_QUOTE if second of a doubled quote
|
||||||
|
FIELDEND_QUOTE if end of field quote
|
||||||
|
DONTKNOW_QUOTE if an unescaped quote we don't consider as end of field,
|
||||||
|
do not increment nQuotes in caller then!
|
||||||
|
*/
|
||||||
|
static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
|
||||||
|
const sal_Unicode* pSeps, sal_Unicode cStr )
|
||||||
|
{
|
||||||
|
if ((nQuotes % 2) == 0)
|
||||||
|
{
|
||||||
|
if (p[-1] == cStr)
|
||||||
|
return SECOND_QUOTE;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
SAL_WARN( "sc", "lcl_isEscapedOrFieldEndQuote: really want a FIELDSTART_QUOTE?");
|
||||||
|
return FIELDSTART_QUOTE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (p[1] == cStr)
|
||||||
|
return FIRST_QUOTE;
|
||||||
|
return lcl_isFieldEndQuote( p, pSeps);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Append characters of [p1,p2) to rField.
|
/** Append characters of [p1,p2) to rField.
|
||||||
|
|
||||||
@returns TRUE if ok; FALSE if data overflow, truncated
|
@returns TRUE if ok; FALSE if data overflow, truncated
|
||||||
@@ -606,7 +677,7 @@ enum DoubledQuoteMode
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString,
|
static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString,
|
||||||
sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
|
const sal_Unicode* pSeps, sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
|
||||||
{
|
{
|
||||||
p++; //! jump over opening quote
|
p++; //! jump over opening quote
|
||||||
bool bCont;
|
bool bCont;
|
||||||
@@ -621,7 +692,18 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString,
|
|||||||
if( *p == cStr )
|
if( *p == cStr )
|
||||||
{
|
{
|
||||||
if ( *++p != cStr )
|
if ( *++p != cStr )
|
||||||
|
{
|
||||||
|
// break or continue for loop
|
||||||
|
if (eMode == DQM_ESCAPE)
|
||||||
|
{
|
||||||
|
if (lcl_isFieldEndQuote( p-1, pSeps) == FIELDEND_QUOTE)
|
||||||
break;
|
break;
|
||||||
|
else
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
// doubled quote char
|
// doubled quote char
|
||||||
switch ( eMode )
|
switch ( eMode )
|
||||||
{
|
{
|
||||||
@@ -815,6 +897,10 @@ bool ScImportExport::Text2Doc( SvStream& rStrm )
|
|||||||
{
|
{
|
||||||
bool bOk = true;
|
bool bOk = true;
|
||||||
|
|
||||||
|
sal_Unicode pSeps[2];
|
||||||
|
pSeps[0] = cSep;
|
||||||
|
pSeps[1] = 0;
|
||||||
|
|
||||||
SCCOL nStartCol = aRange.aStart.Col();
|
SCCOL nStartCol = aRange.aStart.Col();
|
||||||
SCROW nStartRow = aRange.aStart.Row();
|
SCROW nStartRow = aRange.aStart.Row();
|
||||||
SCCOL nEndCol = aRange.aEnd.Col();
|
SCCOL nEndCol = aRange.aEnd.Col();
|
||||||
@@ -843,7 +929,7 @@ bool ScImportExport::Text2Doc( SvStream& rStrm )
|
|||||||
aCell.Erase();
|
aCell.Erase();
|
||||||
if( *p == cStr )
|
if( *p == cStr )
|
||||||
{
|
{
|
||||||
p = lcl_ScanString( p, aCell, cStr, DQM_KEEP, bOverflowCell );
|
p = lcl_ScanString( p, aCell, pSeps, cStr, DQM_KEEP, bOverflowCell );
|
||||||
while( *p && *p != cSep )
|
while( *p && *p != cSep )
|
||||||
p++;
|
p++;
|
||||||
if( *p )
|
if( *p )
|
||||||
@@ -1277,7 +1363,7 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
|
|||||||
for( ;; )
|
for( ;; )
|
||||||
{
|
{
|
||||||
aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr);
|
aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr);
|
||||||
if ( rStrm.IsEof() )
|
if ( rStrm.IsEof() && aLine.isEmpty() )
|
||||||
break;
|
break;
|
||||||
|
|
||||||
sal_Int32 nLineLen = aLine.getLength();
|
sal_Int32 nLineLen = aLine.getLength();
|
||||||
@@ -1445,7 +1531,7 @@ const sal_Unicode* ScImportExport::ScanNextFieldFromString( const sal_Unicode* p
|
|||||||
{
|
{
|
||||||
rbIsQuoted = true;
|
rbIsQuoted = true;
|
||||||
const sal_Unicode* p1;
|
const sal_Unicode* p1;
|
||||||
p1 = p = lcl_ScanString( p, rField, cStr, DQM_ESCAPE, rbOverflowCell );
|
p1 = p = lcl_ScanString( p, rField, pSeps, cStr, DQM_ESCAPE, rbOverflowCell );
|
||||||
while ( *p && !ScGlobal::UnicodeStrChr( pSeps, *p ) )
|
while ( *p && !ScGlobal::UnicodeStrChr( pSeps, *p ) )
|
||||||
p++;
|
p++;
|
||||||
// Append remaining unquoted and undelimited data (dirty, dirty) to
|
// Append remaining unquoted and undelimited data (dirty, dirty) to
|
||||||
@@ -2213,8 +2299,7 @@ inline const sal_Unicode* lcl_UnicodeStrChr( const sal_Unicode* pStr,
|
|||||||
}
|
}
|
||||||
|
|
||||||
rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
|
rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
|
||||||
const String& rFieldSeparators, sal_Unicode cFieldQuote,
|
const String& rFieldSeparators, sal_Unicode cFieldQuote )
|
||||||
bool bAllowBackslashEscape)
|
|
||||||
{
|
{
|
||||||
rtl::OUString aStr;
|
rtl::OUString aStr;
|
||||||
rStream.ReadUniOrByteStringLine(aStr, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit);
|
rStream.ReadUniOrByteStringLine(aStr, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit);
|
||||||
@@ -2226,11 +2311,13 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
|
|||||||
// See if the separator(s) include tab.
|
// See if the separator(s) include tab.
|
||||||
bool bTabSep = lcl_UnicodeStrChr(pSeps, '\t') != NULL;
|
bool bTabSep = lcl_UnicodeStrChr(pSeps, '\t') != NULL;
|
||||||
|
|
||||||
|
QuoteType eQuoteState = FIELDEND_QUOTE;
|
||||||
|
bool bFieldStart = true;
|
||||||
|
|
||||||
sal_Int32 nLastOffset = 0;
|
sal_Int32 nLastOffset = 0;
|
||||||
sal_Int32 nQuotes = 0;
|
sal_Int32 nQuotes = 0;
|
||||||
while (!rStream.IsEof() && aStr.getLength() < nArbitraryLineLengthLimit)
|
while (!rStream.IsEof() && aStr.getLength() < nArbitraryLineLengthLimit)
|
||||||
{
|
{
|
||||||
bool bBackslashEscaped = false;
|
|
||||||
const sal_Unicode *p, *pStart;
|
const sal_Unicode *p, *pStart;
|
||||||
p = pStart = aStr.getStr();
|
p = pStart = aStr.getStr();
|
||||||
p += nLastOffset;
|
p += nLastOffset;
|
||||||
@@ -2248,25 +2335,66 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (*p == cFieldQuote && !bBackslashEscaped)
|
if (*p == cFieldQuote)
|
||||||
++nQuotes;
|
|
||||||
else if (bAllowBackslashEscape)
|
|
||||||
{
|
{
|
||||||
if (*p == '\\')
|
if (bFieldStart)
|
||||||
bBackslashEscaped = !bBackslashEscaped;
|
{
|
||||||
|
++nQuotes;
|
||||||
|
bFieldStart = false;
|
||||||
|
eQuoteState = FIELDSTART_QUOTE;
|
||||||
|
}
|
||||||
|
// Do not detect a FIELDSTART_QUOTE if not in
|
||||||
|
// bFieldStart mode, in which case for unquoted content
|
||||||
|
// we are in FIELDEND_QUOTE state.
|
||||||
|
else if (eQuoteState != FIELDEND_QUOTE)
|
||||||
|
{
|
||||||
|
eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote);
|
||||||
|
// DONTKNOW_QUOTE is an embedded unescaped quote we
|
||||||
|
// don't count for pairing.
|
||||||
|
if (eQuoteState != DONTKNOW_QUOTE)
|
||||||
|
++nQuotes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (eQuoteState == FIELDEND_QUOTE)
|
||||||
|
{
|
||||||
|
if (bFieldStart)
|
||||||
|
// If blank is a separator it starts a field, if it
|
||||||
|
// is not and thus maybe leading before quote we
|
||||||
|
// are still at start of field regarding quotes.
|
||||||
|
bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL);
|
||||||
else
|
else
|
||||||
bBackslashEscaped = false;
|
bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (*p == cFieldQuote && (p == pStart ||
|
else
|
||||||
lcl_UnicodeStrChr( pSeps, p[-1])))
|
{
|
||||||
|
if (*p == cFieldQuote && bFieldStart)
|
||||||
|
{
|
||||||
nQuotes = 1;
|
nQuotes = 1;
|
||||||
|
eQuoteState = FIELDSTART_QUOTE;
|
||||||
|
bFieldStart = false;
|
||||||
|
}
|
||||||
|
else if (eQuoteState == FIELDEND_QUOTE)
|
||||||
|
{
|
||||||
|
// This also skips leading blanks at beginning of line
|
||||||
|
// if followed by a quote. It's debatable whether we
|
||||||
|
// actually want that or not, but congruent with what
|
||||||
|
// ScanNextFieldFromString() does.
|
||||||
|
if (bFieldStart)
|
||||||
|
bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL);
|
||||||
|
else
|
||||||
|
bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
// A quote character inside a field content does not start
|
// A quote character inside a field content does not start
|
||||||
// a quote.
|
// a quote.
|
||||||
++p;
|
++p;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nQuotes % 2 == 0)
|
if (nQuotes % 2 == 0)
|
||||||
|
// We still have a (theoretical?) problem here if due to
|
||||||
|
// nArbitraryLineLengthLimit we split a string right between a
|
||||||
|
// doubled quote pair.
|
||||||
break;
|
break;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@@ -208,11 +208,10 @@ public:
|
|||||||
within a field, the field content MUST be surrounded by
|
within a field, the field content MUST be surrounded by
|
||||||
cFieldQuote characters, and the opening cFieldQuote MUST be
|
cFieldQuote characters, and the opening cFieldQuote MUST be
|
||||||
at the very start of a line or follow right behind a field
|
at the very start of a line or follow right behind a field
|
||||||
separator with no extra characters in between. Anything,
|
separator with no extra characters in between, with the
|
||||||
|
exception of blanks contradictory to RFC 4180. Anything,
|
||||||
including field separators and escaped quotes (by doubling
|
including field separators and escaped quotes (by doubling
|
||||||
them, or preceding them with a backslash if
|
them) may appear in a quoted field.
|
||||||
bAllowBackslashEscape==TRUE) may appear in a quoted
|
|
||||||
field.
|
|
||||||
|
|
||||||
If bEmbeddedLineBreak==FALSE, nothing is parsed and the
|
If bEmbeddedLineBreak==FALSE, nothing is parsed and the
|
||||||
string returned is simply one ReadUniOrByteStringLine().
|
string returned is simply one ReadUniOrByteStringLine().
|
||||||
@@ -223,11 +222,6 @@ public:
|
|||||||
@param cFieldQuote
|
@param cFieldQuote
|
||||||
The quote character used.
|
The quote character used.
|
||||||
|
|
||||||
@param bAllowBackslashEscape
|
|
||||||
If TRUE, an embedded quote character inside a quoted
|
|
||||||
field may also be escaped with a preceding backslash.
|
|
||||||
Normally, quotes are escaped by doubling them.
|
|
||||||
|
|
||||||
check Stream::good() to detect IO problems during read
|
check Stream::good() to detect IO problems during read
|
||||||
|
|
||||||
@ATTENTION
|
@ATTENTION
|
||||||
@@ -248,8 +242,7 @@ public:
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
SC_DLLPUBLIC rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
|
SC_DLLPUBLIC rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
|
||||||
const String& rFieldSeparators, sal_Unicode cFieldQuote,
|
const String& rFieldSeparators, sal_Unicode cFieldQuote );
|
||||||
bool bAllowBackslashEscape = false);
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user