resolved fdo#48621 better handling of broken CSV files

* Non-escaped (not doubled) quotes in quoted strings are regarded as a broken representation and are taken literally; only a quote followed by a separator ends a field. Unless blanks are separators themselves, trailing blanks between the ending quote and the separator are ignored, complementary to leading blanks between a separator and a quote.
* Quotes in a non-quoted string are taken literally.
2012-04-14 18:57:31 +02:00
parent 95cc5de63b
commit 7928b65196
2 changed files with 151 additions and 30 deletions
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -573,6 +573,77 @@ void ScImportExport::WriteUnicodeOrByteEndl( SvStream& rStrm )
 }
 /** Classification of a quote character encountered while scanning CSV
    text. The numeric values are the implicit ones of the declaration order.
  */
 enum QuoteType
 {
    FIELDSTART_QUOTE = 0,   ///< first quote in a field
    FIRST_QUOTE      = 1,   ///< first quote of a doubled (escaped) pair
    SECOND_QUOTE     = 2,   ///< second quote of a doubled (escaped) pair
    FIELDEND_QUOTE   = 3,   ///< quote that ends a quoted field
    DONTKNOW_QUOTE   = 4    ///< unescaped quote not regarded as end of field
 };
 /** Determine if *p is a quote that ends a quoted field.

    Broken CSV generators may not double embedded quotes, so a quote is
    accepted as end of field only if it is followed by a field separator,
    possibly preceded by blanks that are not separators themselves, or by the
    end of the string.

    Precondition: we are parsing a quoted field already and *p is a quote.

    @param p
        Points at the quote under test. A 0 character following it is treated
        as end of string.
    @param pSeps
        The field separator characters, passed to ScGlobal::UnicodeStrChr()
        (presumably a 0-terminated list, as built in Text2Doc()).
    @return
        FIELDEND_QUOTE if end of field quote
        DONTKNOW_QUOTE anything else
  */
 static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps )
 {
    const sal_Unicode cBlank = ' ';
    // If blank is a separator itself, a blank right behind the quote ends the
    // field. It must not be skipped as a trailing blank, otherwise the
    // following field content would be mistaken for a missing separator.
    if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank))
        return FIELDEND_QUOTE;
    // Blanks that are not separators are ignored between the ending quote and
    // the separator, complementary to leading blanks before an opening quote.
    while (p[1] == cBlank)
        ++p;
    // Only a separator or the end of string ends the field.
    if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
        return FIELDEND_QUOTE;
    return DONTKNOW_QUOTE;
 }
 /** Classify the quote at *p: is it one half of a doubled (escaped) quote
    pair, or does it end a quoted field?

    Precondition: *p is a quote. Both p[-1] and p[1] are read, so p must
    neither point at the very first character of the buffer nor beyond its
    terminator.

    @param nQuotes
        Number of quote characters counted so far.
        An odd count (after the opening quote) means there were no embedded
        quotes yet or only complete pairs.
        An even count means we are either not in a quoted field or have
        already seen the first quote of a pair.
    @param p
        Points at the quote under test.
    @param pSeps
        The field separator characters, handed on to lcl_isFieldEndQuote().
    @param cStr
        The quote character.
    @return
        FIELDSTART_QUOTE if first quote in a field, either starting content or
                            embedded so caller should check beforehand.
        FIRST_QUOTE      if first of a doubled quote
        SECOND_QUOTE     if second of a doubled quote
        FIELDEND_QUOTE   if end of field quote
        DONTKNOW_QUOTE   if an unescaped quote we don't consider as end of field,
                            do not increment nQuotes in caller then!
  */
 static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
        const sal_Unicode* pSeps, sal_Unicode cStr )
 {
    const bool bEvenCount = ((nQuotes % 2) == 0);
    if (!bEvenCount)
    {
        // Inside quoted content with all pairs complete so far: an immediately
        // following quote starts a new pair, anything else is left to the
        // end-of-field check.
        return (p[1] == cStr) ? FIRST_QUOTE : lcl_isFieldEndQuote( p, pSeps);
    }

    // Even count: the quote right before this one makes it the closing half
    // of a pair.
    if (p[-1] != cStr)
    {
        SAL_WARN( "sc", "lcl_isEscapedOrFieldEndQuote: really want a FIELDSTART_QUOTE?");
        return FIELDSTART_QUOTE;
    }
    return SECOND_QUOTE;
 }
 /** Append characters of [p1,p2) to rField.
    @returns TRUE if ok; FALSE if data overflow, truncated
@@ -606,7 +677,7 @@ enum DoubledQuoteMode
 };
 static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString,
-            sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
+            const sal_Unicode* pSeps, sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
 {
    p++;    //! jump over opening quote
    bool bCont;
@@ -621,7 +692,18 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString,
            if( *p == cStr )
            {
                if ( *++p != cStr )
                {
                    // break or continue for loop
                    if (eMode == DQM_ESCAPE)
                    {
                        if (lcl_isFieldEndQuote( p-1, pSeps) == FIELDEND_QUOTE)
                            break;
                        else
                            continue;
                    }
                    else
                        break;
                }
                // doubled quote char
                switch ( eMode )
                {
@@ -815,6 +897,10 @@ bool ScImportExport::Text2Doc( SvStream& rStrm )
 {
    bool bOk = true;
    sal_Unicode pSeps[2];
    pSeps[0] = cSep;
    pSeps[1] = 0;
    SCCOL nStartCol = aRange.aStart.Col();
    SCROW nStartRow = aRange.aStart.Row();
    SCCOL nEndCol = aRange.aEnd.Col();
@@ -843,7 +929,7 @@ bool ScImportExport::Text2Doc( SvStream& rStrm )
                aCell.Erase();
                if( *p == cStr )
                {
-                    p = lcl_ScanString( p, aCell, cStr, DQM_KEEP, bOverflowCell );
+                    p = lcl_ScanString( p, aCell, pSeps, cStr, DQM_KEEP, bOverflowCell );
                    while( *p && *p != cSep )
                        p++;
                    if( *p )
@@ -1277,7 +1363,7 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
        for( ;; )
        {
            aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr);
-            if ( rStrm.IsEof() )
+            if ( rStrm.IsEof() && aLine.isEmpty() )
                break;
            sal_Int32 nLineLen = aLine.getLength();
@@ -1445,7 +1531,7 @@ const sal_Unicode* ScImportExport::ScanNextFieldFromString( const sal_Unicode* p
    {
        rbIsQuoted = true;
        const sal_Unicode* p1;
-        p1 = p = lcl_ScanString( p, rField, cStr, DQM_ESCAPE, rbOverflowCell );
+        p1 = p = lcl_ScanString( p, rField, pSeps, cStr, DQM_ESCAPE, rbOverflowCell );
        while ( *p && !ScGlobal::UnicodeStrChr( pSeps, *p ) )
            p++;
        // Append remaining unquoted and undelimited data (dirty, dirty) to
@@ -2213,8 +2299,7 @@ inline const sal_Unicode* lcl_UnicodeStrChr( const sal_Unicode* pStr,
 }
 rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
-        const String& rFieldSeparators, sal_Unicode cFieldQuote,
+        const String& rFieldSeparators, sal_Unicode cFieldQuote )
        bool bAllowBackslashEscape)
 {
    rtl::OUString aStr;
    rStream.ReadUniOrByteStringLine(aStr, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit);
@@ -2226,11 +2311,13 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
        // See if the separator(s) include tab.
        bool bTabSep = lcl_UnicodeStrChr(pSeps, '\t') != NULL;
        QuoteType eQuoteState = FIELDEND_QUOTE;
        bool bFieldStart = true;
        sal_Int32 nLastOffset = 0;
        sal_Int32 nQuotes = 0;
        while (!rStream.IsEof() && aStr.getLength() < nArbitraryLineLengthLimit)
        {
            bool bBackslashEscaped = false;
            const sal_Unicode *p, *pStart;
            p = pStart = aStr.getStr();
            p += nLastOffset;
@@ -2248,25 +2335,66 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
                        break;
                    }
-                    if (*p == cFieldQuote && !bBackslashEscaped)
+                    if (*p == cFieldQuote)
                        ++nQuotes;
                    else if (bAllowBackslashEscape)
                    {
-                        if (*p == '\\')
+                        if (bFieldStart)
-                            bBackslashEscaped = !bBackslashEscaped;
+                        {
                            ++nQuotes;
                            bFieldStart = false;
                            eQuoteState = FIELDSTART_QUOTE;
                        }
                        // Do not detect a FIELDSTART_QUOTE if not in
                        // bFieldStart mode, in which case for unquoted content
                        // we are in FIELDEND_QUOTE state.
                        else if (eQuoteState != FIELDEND_QUOTE)
                        {
                            eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote);
                            // DONTKNOW_QUOTE is an embedded unescaped quote we
                            // don't count for pairing.
                            if (eQuoteState != DONTKNOW_QUOTE)
                                ++nQuotes;
                        }
                    }
                    else if (eQuoteState == FIELDEND_QUOTE)
                    {
                        if (bFieldStart)
                            // If blank is a separator it starts a field, if it
                            // is not and thus maybe leading before quote we
                            // are still at start of field regarding quotes.
                            bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL);
                        else
-                            bBackslashEscaped = false;
+                            bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL);
                    }
                }
-                else if (*p == cFieldQuote && (p == pStart ||
+                else
-                            lcl_UnicodeStrChr( pSeps, p[-1])))
+                {
                    if (*p == cFieldQuote && bFieldStart)
                    {
                        nQuotes = 1;
                        eQuoteState = FIELDSTART_QUOTE;
                        bFieldStart = false;
                    }
                    else if (eQuoteState == FIELDEND_QUOTE)
                    {
                        // This also skips leading blanks at beginning of line
                        // if followed by a quote. It's debatable whether we
                        // actually want that or not, but congruent with what
                        // ScanNextFieldFromString() does.
                        if (bFieldStart)
                            bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL);
                        else
                            bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL);
                    }
                }
                // A quote character inside a field content does not start
                // a quote.
                ++p;
            }
            if (nQuotes % 2 == 0)
                // We still have a (theoretical?) problem here if due to
                // nArbitraryLineLengthLimit we split a string right between a
                // doubled quote pair.
                break;
            else
            {
--- a/sc/source/ui/inc/impex.hxx
+++ b/sc/source/ui/inc/impex.hxx
@@ -208,11 +208,10 @@ public:
    within a field, the field content MUST be surrounded by
    cFieldQuote characters, and the opening cFieldQuote MUST be
    at the very start of a line or follow right behind a field
-    separator with no extra characters in between. Anything,
+    separator with no extra characters in between, with the
    exception of blanks contradictory to RFC 4180. Anything,
    including field separators and escaped quotes (by doubling
-    them, or preceding them with a backslash if
+    them) may appear in a quoted field.
    bAllowBackslashEscape==TRUE) may appear in a quoted
    field.
    If bEmbeddedLineBreak==FALSE, nothing is parsed and the
    string returned is simply one ReadUniOrByteStringLine().
@@ -223,11 +222,6 @@ public:
    @param cFieldQuote
    The quote character used.
    @param bAllowBackslashEscape
    If TRUE, an embedded quote character inside a quoted
    field may also be escaped with a preceding backslash.
    Normally, quotes are escaped by doubling them.
    check Stream::good() to detect IO problems during read
    @ATTENTION
@@ -248,8 +242,7 @@ public:
  */
 SC_DLLPUBLIC rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
-        const String& rFieldSeparators, sal_Unicode cFieldQuote,
+        const String& rFieldSeparators, sal_Unicode cFieldQuote );
        bool bAllowBackslashEscape = false);
 #endif