resolved fdo#48621 better handling of broken CSV files
* non-escaped (not doubled) quotes in quoted strings are regarded as broken representation and are taken literally, only a quote followed by a separator ends a field. If not being a separator themselves, trailing blanks between the ending quote and the separator are ignored, complementary to leading blanks between a separator and a quote. * quotes in a non-quoted string are taken literally
This commit is contained in:
@@ -573,6 +573,77 @@ void ScImportExport::WriteUnicodeOrByteEndl( SvStream& rStrm )
|
||||
}
|
||||
|
||||
|
||||
enum QuoteType
|
||||
{
|
||||
FIELDSTART_QUOTE,
|
||||
FIRST_QUOTE,
|
||||
SECOND_QUOTE,
|
||||
FIELDEND_QUOTE,
|
||||
DONTKNOW_QUOTE
|
||||
};
|
||||
|
||||
|
||||
/** Determine if *p is a quote that ends a quoted field.
|
||||
|
||||
Precondition: we are parsing a quoted field already and *p is a quote.
|
||||
|
||||
@return
|
||||
FIELDEND_QUOTE if end of field quote
|
||||
DONTKNOW_QUOTE anything else
|
||||
*/
|
||||
static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps )
|
||||
{
|
||||
// Due to broken CSV generators that don't double embedded quotes check if
|
||||
// a field separator immediately or with trailing spaces follows the quote,
|
||||
// only then end the field, or at end of string.
|
||||
while (p[1] == ' ')
|
||||
++p;
|
||||
if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
|
||||
return FIELDEND_QUOTE;
|
||||
return DONTKNOW_QUOTE;
|
||||
}
|
||||
|
||||
|
||||
/** Determine if *p is a quote that is escaped by being doubled or ends a
|
||||
quoted field.
|
||||
|
||||
Precondition: *p is a quote.
|
||||
|
||||
@param nQuotes
|
||||
Quote characters encountered so far.
|
||||
Odd (after opening quote) means either no embedded quotes or only quote
|
||||
pairs so far.
|
||||
Even means either not in a quoted field or already one quote
|
||||
encountered, the first of a pair.
|
||||
|
||||
@return
|
||||
FIELDSTART_QUOTE if first quote in a field, either starting content or
|
||||
embedded so caller should check beforehand.
|
||||
FIRST_QUOTE if first of a doubled quote
|
||||
SECOND_QUOTE if second of a doubled quote
|
||||
FIELDEND_QUOTE if end of field quote
|
||||
DONTKNOW_QUOTE if an unescaped quote we don't consider as end of field,
|
||||
do not increment nQuotes in caller then!
|
||||
*/
|
||||
static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
|
||||
const sal_Unicode* pSeps, sal_Unicode cStr )
|
||||
{
|
||||
if ((nQuotes % 2) == 0)
|
||||
{
|
||||
if (p[-1] == cStr)
|
||||
return SECOND_QUOTE;
|
||||
else
|
||||
{
|
||||
SAL_WARN( "sc", "lcl_isEscapedOrFieldEndQuote: really want a FIELDSTART_QUOTE?");
|
||||
return FIELDSTART_QUOTE;
|
||||
}
|
||||
}
|
||||
if (p[1] == cStr)
|
||||
return FIRST_QUOTE;
|
||||
return lcl_isFieldEndQuote( p, pSeps);
|
||||
}
|
||||
|
||||
|
||||
/** Append characters of [p1,p2) to rField.
|
||||
|
||||
@returns TRUE if ok; FALSE if data overflow, truncated
|
||||
@@ -606,7 +677,7 @@ enum DoubledQuoteMode
|
||||
};
|
||||
|
||||
static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString,
|
||||
sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
|
||||
const sal_Unicode* pSeps, sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
|
||||
{
|
||||
p++; //! jump over opening quote
|
||||
bool bCont;
|
||||
@@ -621,7 +692,18 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString,
|
||||
if( *p == cStr )
|
||||
{
|
||||
if ( *++p != cStr )
|
||||
break;
|
||||
{
|
||||
// break or continue for loop
|
||||
if (eMode == DQM_ESCAPE)
|
||||
{
|
||||
if (lcl_isFieldEndQuote( p-1, pSeps) == FIELDEND_QUOTE)
|
||||
break;
|
||||
else
|
||||
continue;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
// doubled quote char
|
||||
switch ( eMode )
|
||||
{
|
||||
@@ -815,6 +897,10 @@ bool ScImportExport::Text2Doc( SvStream& rStrm )
|
||||
{
|
||||
bool bOk = true;
|
||||
|
||||
sal_Unicode pSeps[2];
|
||||
pSeps[0] = cSep;
|
||||
pSeps[1] = 0;
|
||||
|
||||
SCCOL nStartCol = aRange.aStart.Col();
|
||||
SCROW nStartRow = aRange.aStart.Row();
|
||||
SCCOL nEndCol = aRange.aEnd.Col();
|
||||
@@ -843,7 +929,7 @@ bool ScImportExport::Text2Doc( SvStream& rStrm )
|
||||
aCell.Erase();
|
||||
if( *p == cStr )
|
||||
{
|
||||
p = lcl_ScanString( p, aCell, cStr, DQM_KEEP, bOverflowCell );
|
||||
p = lcl_ScanString( p, aCell, pSeps, cStr, DQM_KEEP, bOverflowCell );
|
||||
while( *p && *p != cSep )
|
||||
p++;
|
||||
if( *p )
|
||||
@@ -1277,7 +1363,7 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
|
||||
for( ;; )
|
||||
{
|
||||
aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr);
|
||||
if ( rStrm.IsEof() )
|
||||
if ( rStrm.IsEof() && aLine.isEmpty() )
|
||||
break;
|
||||
|
||||
sal_Int32 nLineLen = aLine.getLength();
|
||||
@@ -1445,7 +1531,7 @@ const sal_Unicode* ScImportExport::ScanNextFieldFromString( const sal_Unicode* p
|
||||
{
|
||||
rbIsQuoted = true;
|
||||
const sal_Unicode* p1;
|
||||
p1 = p = lcl_ScanString( p, rField, cStr, DQM_ESCAPE, rbOverflowCell );
|
||||
p1 = p = lcl_ScanString( p, rField, pSeps, cStr, DQM_ESCAPE, rbOverflowCell );
|
||||
while ( *p && !ScGlobal::UnicodeStrChr( pSeps, *p ) )
|
||||
p++;
|
||||
// Append remaining unquoted and undelimited data (dirty, dirty) to
|
||||
@@ -2212,9 +2298,8 @@ inline const sal_Unicode* lcl_UnicodeStrChr( const sal_Unicode* pStr,
|
||||
return 0;
|
||||
}
|
||||
|
||||
rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
|
||||
const String& rFieldSeparators, sal_Unicode cFieldQuote,
|
||||
bool bAllowBackslashEscape)
|
||||
rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
|
||||
const String& rFieldSeparators, sal_Unicode cFieldQuote )
|
||||
{
|
||||
rtl::OUString aStr;
|
||||
rStream.ReadUniOrByteStringLine(aStr, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit);
|
||||
@@ -2226,11 +2311,13 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
|
||||
// See if the separator(s) include tab.
|
||||
bool bTabSep = lcl_UnicodeStrChr(pSeps, '\t') != NULL;
|
||||
|
||||
QuoteType eQuoteState = FIELDEND_QUOTE;
|
||||
bool bFieldStart = true;
|
||||
|
||||
sal_Int32 nLastOffset = 0;
|
||||
sal_Int32 nQuotes = 0;
|
||||
while (!rStream.IsEof() && aStr.getLength() < nArbitraryLineLengthLimit)
|
||||
{
|
||||
bool bBackslashEscaped = false;
|
||||
const sal_Unicode *p, *pStart;
|
||||
p = pStart = aStr.getStr();
|
||||
p += nLastOffset;
|
||||
@@ -2248,25 +2335,66 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
|
||||
break;
|
||||
}
|
||||
|
||||
if (*p == cFieldQuote && !bBackslashEscaped)
|
||||
++nQuotes;
|
||||
else if (bAllowBackslashEscape)
|
||||
if (*p == cFieldQuote)
|
||||
{
|
||||
if (*p == '\\')
|
||||
bBackslashEscaped = !bBackslashEscaped;
|
||||
if (bFieldStart)
|
||||
{
|
||||
++nQuotes;
|
||||
bFieldStart = false;
|
||||
eQuoteState = FIELDSTART_QUOTE;
|
||||
}
|
||||
// Do not detect a FIELDSTART_QUOTE if not in
|
||||
// bFieldStart mode, in which case for unquoted content
|
||||
// we are in FIELDEND_QUOTE state.
|
||||
else if (eQuoteState != FIELDEND_QUOTE)
|
||||
{
|
||||
eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote);
|
||||
// DONTKNOW_QUOTE is an embedded unescaped quote we
|
||||
// don't count for pairing.
|
||||
if (eQuoteState != DONTKNOW_QUOTE)
|
||||
++nQuotes;
|
||||
}
|
||||
}
|
||||
else if (eQuoteState == FIELDEND_QUOTE)
|
||||
{
|
||||
if (bFieldStart)
|
||||
// If blank is a separator it starts a field, if it
|
||||
// is not and thus maybe leading before quote we
|
||||
// are still at start of field regarding quotes.
|
||||
bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL);
|
||||
else
|
||||
bBackslashEscaped = false;
|
||||
bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (*p == cFieldQuote && bFieldStart)
|
||||
{
|
||||
nQuotes = 1;
|
||||
eQuoteState = FIELDSTART_QUOTE;
|
||||
bFieldStart = false;
|
||||
}
|
||||
else if (eQuoteState == FIELDEND_QUOTE)
|
||||
{
|
||||
// This also skips leading blanks at beginning of line
|
||||
// if followed by a quote. It's debatable whether we
|
||||
// actually want that or not, but congruent with what
|
||||
// ScanNextFieldFromString() does.
|
||||
if (bFieldStart)
|
||||
bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL);
|
||||
else
|
||||
bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL);
|
||||
}
|
||||
}
|
||||
else if (*p == cFieldQuote && (p == pStart ||
|
||||
lcl_UnicodeStrChr( pSeps, p[-1])))
|
||||
nQuotes = 1;
|
||||
// A quote character inside a field content does not start
|
||||
// a quote.
|
||||
++p;
|
||||
}
|
||||
|
||||
if (nQuotes % 2 == 0)
|
||||
// We still have a (theoretical?) problem here if due to
|
||||
// nArbitraryLineLengthLimit we split a string right between a
|
||||
// doubled quote pair.
|
||||
break;
|
||||
else
|
||||
{
|
||||
|
@@ -208,11 +208,10 @@ public:
|
||||
within a field, the field content MUST be surrounded by
|
||||
cFieldQuote characters, and the opening cFieldQuote MUST be
|
||||
at the very start of a line or follow right behind a field
|
||||
separator with no extra characters in between. Anything,
|
||||
separator with no extra characters in between, with the
|
||||
exception of blanks contradictory to RFC 4180. Anything,
|
||||
including field separators and escaped quotes (by doubling
|
||||
them, or preceding them with a backslash if
|
||||
bAllowBackslashEscape==TRUE) may appear in a quoted
|
||||
field.
|
||||
them) may appear in a quoted field.
|
||||
|
||||
If bEmbeddedLineBreak==FALSE, nothing is parsed and the
|
||||
string returned is simply one ReadUniOrByteStringLine().
|
||||
@@ -223,11 +222,6 @@ public:
|
||||
@param cFieldQuote
|
||||
The quote character used.
|
||||
|
||||
@param bAllowBackslashEscape
|
||||
If TRUE, an embedded quote character inside a quoted
|
||||
field may also be escaped with a preceding backslash.
|
||||
Normally, quotes are escaped by doubling them.
|
||||
|
||||
check Stream::good() to detect IO problems during read
|
||||
|
||||
@ATTENTION
|
||||
@@ -247,9 +241,8 @@ public:
|
||||
may start under false preconditions.
|
||||
|
||||
*/
|
||||
SC_DLLPUBLIC rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
|
||||
const String& rFieldSeparators, sal_Unicode cFieldQuote,
|
||||
bool bAllowBackslashEscape = false);
|
||||
SC_DLLPUBLIC rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
|
||||
const String& rFieldSeparators, sal_Unicode cFieldQuote );
|
||||
|
||||
#endif
|
||||
|
||||
|
Reference in New Issue
Block a user