libreoffice/xmlhelp/source/cxxhelp/qe/XmlIndex.cxx

/*************************************************************************
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * Copyright 2008 by Sun Microsystems, Inc.
 *
 * OpenOffice.org - a multi-platform office productivity suite
 *
 * $RCSfile: XmlIndex.cxx,v $
 * $Revision: 1.19 $
 *
 * This file is part of OpenOffice.org.
 *
 * OpenOffice.org is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3
 * only, as published by the Free Software Foundation.
 *
 * OpenOffice.org is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License version 3 for more details
 * (a copy is included in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU Lesser General Public License
 * version 3 along with OpenOffice.org.  If not, see
 * <http://www.openoffice.org/license.html>
 * for a copy of the LGPLv3 License.
 *
 ************************************************************************/

// MARKER(update_precomp.py): autogen include statement, do not remove
#include "precompiled_xmlhelp.hxx"
#include <osl/diagnose.h>
#include <qe/XmlIndex.hxx>
#include <qe/DocGenerator.hxx>
#include <util/ConceptList.hxx>
#ifndef _XMLSEARCH_UTIL_RANDOMACCESSSTREAM_HXX_
#include <util/RandomAccessStream.hxx>
#endif
#include <util/Decompressor.hxx>
#include <qe/Query.hxx>

using namespace xmlsearch;
using namespace xmlsearch::excep;
using namespace xmlsearch::qe;


// extern sal_Int32 getInteger_( const sal_Int8* );


/**
 *  Opens the index below indexDir and loads its tables.
 *
 *  The sections are processed in this order, all through indexAccessor_:
 *  DOCS, CONTEXTS, POSITIONS, DOCS.TAB, OFFSETS and LINKNAMES; finally
 *  the ContextTables object is built from the decoded data.
 *
 *  The raw buffers (allLists_, contextsData_, positions_), positionsFile_,
 *  linkNames_ and contextTables_ are owned by this object and released in
 *  the destructor.  Because a destructor does not run for an object whose
 *  constructor threw, every failure path below frees by hand whatever has
 *  been allocated up to that point before the exception leaves.
 *
 *  @param indexDir  handed to indexAccessor_ to locate the index files
 *  @throws IOException  if one of the index sections cannot be opened/read
 */
XmlIndex::XmlIndex( const rtl::OUString& indexDir )
    throw( IOException )
    : currentBatchOffset_( 0 ),
      maxDocNumberInCache_( -1 ),
      indexAccessor_( indexDir ),
      dict_( indexAccessor_ ),
      contextTables_( 0 ),
      allListsL_( 0 ),
      allLists_( 0 ),
      positionsL_( 0 ),
      positions_( 0 ),
      contextsDataL_( 0 ),
      contextsData_( 0 ),
      concepts_( 0 ),
      documents_( 0 )
{
    // reading DOCS
    try
    {
        allListsL_ = indexAccessor_.readByteArray( allLists_,
                                                   rtl::OUString::createFromAscii("DOCS") ); // reading DOCS
    }
    catch( IOException )
    {
        // Nothing has been allocated by this constructor yet, so there is
        // nothing to free; the assertion only reports the failure in debug builds.
        OSL_ENSURE( allLists_ != 0, "XmlIndex::XmlIndex -> cannot open DOCS/docs" );
        throw;
    }

    // reading CONTEXTS
    try
    {
        contextsDataL_ = indexAccessor_.readByteArray( contextsData_,
                                                       rtl::OUString::createFromAscii("CONTEXTS") ); // reading CONTEXTS
    }
    catch( IOException )
    {
        // Test the buffer that failed to load (contextsData_), mirroring the
        // DOCS handler above; testing allLists_ here could never fire because
        // DOCS has already been read successfully at this point.
        OSL_ENSURE( contextsData_ != 0, "XmlIndex::XmlIndex -> cannot open CONTEXTS/contexts" );
        delete[] allLists_;
        throw;
    }

    // reading POSITIONS
    {
        positionsFile_ = indexAccessor_.getStream( rtl::OUString::createFromAscii( "POSITIONS" ),
                                                   rtl::OUString::createFromAscii( "r" ) );

        OSL_ENSURE( positionsFile_ != 0, "XmlIndex::XmlIndex -> cannot open POSITIONS/positions" );

        if( positionsFile_ )
        {
            //!!! temporary: better than fixed large value, worse than 'intelligent' size mgt
            allInCache_ = true;
            if( allInCache_ )   // yes, intended
            {
                // NOTE(review): microIndexOffsets_ is only filled from OFFSETS
                // further down, so reset() presumably leaves
                // maxDocNumberInCache_ at -1 here - confirm that is intended.
                reset();
                // Load the complete POSITIONS stream into memory.
                positions_ = new sal_Int8[ positionsL_ = positionsFile_->length() ];
                positionsFile_->readBytes( positions_,positionsL_ );
            }
        }
        else
        {
            delete[] allLists_;
            delete[] contextsData_;
            throw IOException( rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no POSITIONS/positions") );
        }
    }


    // reading DOCS.TAB
    {
        util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "DOCS.TAB" ),
                                                                 rtl::OUString::createFromAscii( "r" ) );

        if( in )
        {
            // Each table is preceded by a single byte.  It is read into the
            // last slot of a zeroed 4-byte buffer and converted by
            // getInteger_ (presumably a big-endian decode - confirm); the
            // result is handed to the decompressor as its first argument.
            sal_Int8 a[4];
            a[0] = a[1] = a[2] = 0;
            in->readBytes( &a[3],1 );
            sal_Int32 k1 = ::getInteger_( a );
            util::StreamDecompressor sddocs( in );
            sddocs.ascDecode( k1,concepts_ );
            in->readBytes( &a[3],1 );
            sal_Int32 k2 = ::getInteger_( a );
            // offsets_ gets a leading 0 before the decoded values are added.
            offsets_.push_back( 0 );
            util::StreamDecompressor sdoffsets( in );
            sdoffsets.ascDecode( k2,offsets_ );
            delete in;
        }
        else
        {
            delete[] allLists_;
            delete[] contextsData_;
            delete[] positions_;
            delete positionsFile_;
            throw IOException( rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no DOCS.TAB/docs.tab") );
        }
    }

    // reading OFFSETS
    {
        util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "OFFSETS" ),
                                                                 rtl::OUString::createFromAscii( "r" ) );
        if( in )
        {
            // Four tables follow each other in the stream, each introduced by
            // one parameter byte (same scheme as in DOCS.TAB above):
            // documents_, microIndexOffsets_, titles_ and contextsOffsets_.
            sal_Int8 a[4];
            a[0] = a[1] = a[2] = 0;
            in->readBytes( &a[3],1 );
            sal_Int32 k1 = ::getInteger_( a );
            util::StreamDecompressor sddocs( in );
            sddocs.decode( k1,documents_ );
            in->readBytes( &a[3],1 );
            sal_Int32 k2 = ::getInteger_( a );
            util::StreamDecompressor sdoffsets( in );
            sdoffsets.ascDecode( k2,microIndexOffsets_ );
            in->readBytes( &a[3],1 );
            sal_Int32 k3 = ::getInteger_( a );
            util::StreamDecompressor sdtitles( in );
            sdtitles.decode( k3,titles_ );

            in->readBytes( &a[3],1 );
            sal_Int32 k4 = ::getInteger_( a );
            //     contextsOffsets_ = new IntegerArray(_documents.cardinality() + 1);
            util::StreamDecompressor co(in);
            //    _contextsOffsets.add(0);  // first, trivial offset
            co.ascDecode( k4,contextsOffsets_ );
            delete in;
        }
        else
        {
            delete[] allLists_;
            delete[] contextsData_;
            delete[] positions_;
            delete positionsFile_;
            throw IOException( rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no OFFSETS/offsets") );
        }
    }

    // reading linknames
    {
        util::RandomAccessStream* in =
            indexAccessor_.getStream( rtl::OUString::createFromAscii( "LINKNAMES" ),
                                      rtl::OUString::createFromAscii( "r" ) );
        if( ! in )
        {
            delete[] allLists_;
            delete[] contextsData_;
            delete[] positions_;
            delete positionsFile_;
            // The message names this class; it used to carry the name of
            // BtreeDict, apparently copied from there.
            throw IOException(
                rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no LINKNAMES/linknames" ) );
        }

        // Read the whole stream into bff; bff1 receives the same bytes with
        // all embedded zero bytes removed so that it can be searched as one
        // C string.
        sal_Int32 len = in->length();
        char* bff = new char[ 1 + len ], *bff1 = new char[ 1 + len ];
        bff[ len ] = 0;
        in->readBytes( reinterpret_cast<sal_Int8*>( bff ),len );
        delete in;

        // Now the buffer must be densified.
        int i,len1 = 0;
        for( i = 0; i < len; ++i )
        {
            if( bff[i] )
                bff1[ len1++ ] = bff[i];
        }
        bff1[len1] = 0;
        delete[] bff;
        rtl::OString aStr( bff1 );  // build a string from the densified buffer;
        delete[] bff1;

//      // Now determine the order
// #define NAMECOUNT 16
// #define UNREACHABLEPLACE 100000;
//      /**
//       *  The available names cannot be determined from LINKNAMES at current,
//       *  because LINKNAMES is a serialized Java-object
//       *  Always update LINKNAMES if index.xsl or default.xsl are modified.
//       */
//      rtl::OString LN[NAMECOUNT];
//      LN[0] = "text:span";
//      LN[1] = "help:help-text";
//      LN[2] = "help:to-be-embedded";
//      LN[3] = "headingheading";
//      LN[4] = "office:body";
//      LN[5] = "text:p";
//      LN[6] = "office:document";
//      LN[7] = "help:link";
//      LN[8] = "help:key-word";
//      LN[9] = "table:table";
//      LN[10] = "table:table-header-row";
//      LN[11] = "table:table-row";
//      LN[12] = "table:table-cell";
//      LN[13] = "text:unordered-list";
//      LN[14] = "text:ordered-list";
//      LN[15] = "text:list-item";
        // Now determine the order

        // No trailing semicolon on the macro values: they are used inside
        // expressions and must not terminate the statement themselves.
#define NAMECOUNT 16
#define UNREACHABLEPLACE 100000
        /**
         *  The available names cannot be determined from LINKNAMES at current,
         *  because LINKNAMES is a serialized Java-object
         *  Always update LINKNAMES if index.xsl or default.xsl are modified.
         */

        rtl::OString LN[NAMECOUNT];
        LN[0] = "helpdocument";
        LN[1] = "body";
        LN[2] = "title";
        LN[3] = "table";
        LN[4] = "tablecell";
        LN[5] = "tablerow";
        LN[6] = "list";
        LN[7] = "listitem";
        LN[8] = "item";
        LN[9] = "emph";
        LN[10] = "paragraph";
        LN[11] = "section";
        LN[12] = "bookmark";
        LN[13] = "bookmark_value";
        LN[14] = "ahelp";
        LN[15] = "link";

        // Determine index in file: idx[i] is the position of the first
        // occurrence of LN[i] in the densified buffer.  Names that do not
        // occur are parked at UNREACHABLEPLACE and not counted.
        int idx[NAMECOUNT];
        /*int*/ linkNamesL_ = NAMECOUNT;
        for( i = 0; i < NAMECOUNT; ++i )
            if( ( idx[i] = aStr.indexOf( LN[i] ) ) == -1 ) {
                idx[i] = UNREACHABLEPLACE;
                --linkNamesL_;
            }

        // Selection sort by position: each round picks the remaining name
        // with the smallest position, stores it and parks its slot.
        linkNames_ = new rtl::OUString[linkNamesL_];
        for( i = 0; i < linkNamesL_; ++i ) {
            // TODO: linkNamesL_ counts only names that were found, so a
            // candidate exists in every round as long as all positions are
            // below UNREACHABLEPLACE; a position at or beyond it would never
            // be selected and 'first' would stay 0.
            int first = 0;
            int Place = UNREACHABLEPLACE; // This is definitely the last place
            for( int j = 0; j < NAMECOUNT; ++j )
            {
                if( idx[j] < Place )
                    Place = idx[first = j];
            }
            idx[first] = UNREACHABLEPLACE;
            linkNames_[i] = rtl::OUString( LN[first].getStr(),LN[first].getLength(),RTL_TEXTENCODING_UTF8 );
        }

#undef NAMECOUNT
#undef UNREACHABLEPLACE
    }  // end linknames


    {
        // All inputs are available now; build the context tables from them.
        contextTables_ = new ContextTables(contextsOffsets_,
                                           contextsDataL_,contextsData_,
                                           linkNamesL_,linkNames_ );
    }
}


/**
 *  Releases everything the constructor allocated: the raw byte buffers,
 *  the link name array, the POSITIONS stream and the context tables.
 */
XmlIndex::~XmlIndex()
{
    delete[] allLists_;      // DOCS data
    delete[] contextsData_;  // CONTEXTS data
    delete[] linkNames_;     // names extracted from LINKNAMES
    delete[] positions_;     // cached POSITIONS data
    delete positionsFile_;   // stream opened on POSITIONS
    // NOTE(review): contextTables_ was constructed with contextsData_ and
    // linkNames_, which are already freed at this point - confirm that its
    // destructor does not touch them.
    delete contextTables_;
}


/**
 *  Re-computes the highest document number regarded as cached: the last
 *  index of microIndexOffsets_ when everything is kept in memory,
 *  otherwise -1 (nothing cached).
 */
void XmlIndex::reset()
{
    if( allInCache_ )
        maxDocNumberInCache_ = static_cast<sal_Int32>(microIndexOffsets_.size()) - 1;
    else
        maxDocNumberInCache_ = -1;
}


/**
 *  Binary search in an ascending sorted vector.
 *
 *  @param arr    values sorted in ascending order
 *  @param value  value to look for
 *  @return index of an element equal to value, or -1 if there is none
 *          (also for an empty vector)
 */
sal_Int32 binarySearch( const std::vector<sal_Int32>& arr,sal_Int32 value )
{
    // Inclusive search range [low, high].  The upper bound has to start at
    // the last valid index; starting at size() lets the probe reach
    // arr[size()] for an empty vector or a value above all elements.
    sal_Int32 low = 0;
    sal_Int32 high = static_cast<sal_Int32>( arr.size() ) - 1;

    while( low <= high )
    {
        // low + (high - low)/2 cannot overflow, unlike (low + high)/2.
        const sal_Int32 mid = low + ( high - low ) / 2;

        if( arr[mid] < value )
            low = mid + 1;
        else if( value < arr[mid] )
            high = mid - 1;
        else
            return mid;
    }

    return -1;
}


/**
 *  Creates an iterator over the document list of a concept.
 *
 *  @param concept  concept id to look up in concepts_
 *  @return a newly allocated util::ConceptList positioned at the offset
 *          recorded for the concept, or 0 if the concept is not in
 *          concepts_
 */
NonnegativeIntegerGenerator* XmlIndex::getDocumentIterator( sal_Int32 concept )
{
    const sal_Int32 slot = binarySearch( concepts_,concept );

    // Unknown concept: there is no list to iterate.
    if( slot < 0 )
        return 0;

    return new util::ConceptList( allLists_,allListsL_,offsets_[slot] );
}


/**
 *  Tells whether the given concept id is contained in concepts_.
 */
bool XmlIndex::occursInText( sal_Int32 concept )
{
    const sal_Int32 slot = binarySearch( concepts_,concept );
    return !( slot < 0 );
}


/**
 *  Selects the microindex of a document and returns the positions cache.
 *
 *  @param len    receives positionsL_, the length of the returned buffer
 *  @param docNo  document number; also passed to the context tables
 *  @return positions_, the buffer owned by this object
 */
sal_Int8* XmlIndex::getPositions( sal_Int32& len,sal_Int32 docNo ) throw( excep::XmlSearchException )
{
    contextTables_->setMicroindex( docNo );

    // Documents beyond the cached range have to be (re)loaded first.
    const bool inCache = !( docNo > maxDocNumberInCache_ );
    if( !inCache )
    {
        readMicroindexes( docNo );
    }

    len = positionsL_;
    return positions_;
}


/**
 *  Returns the name of a document.
 *
 *  @param docNumber  index into documents_
 *  @return the string fetched from dict_ for documents_[docNumber]
 *  @throws excep::XmlSearchException  if docNumber is negative or not
 *          smaller than the number of documents
 */
rtl::OUString XmlIndex::documentName( sal_Int32 docNumber ) throw( excep::XmlSearchException )
{
    const bool validNumber =
        !( docNumber < 0 ) && sal_uInt32( docNumber ) < documents_.size();

    if( validNumber )
        return dict_.fetch( documents_[ docNumber ] );

    throw excep::XmlSearchException(
        rtl::OUString::createFromAscii( "XmlIndex::documentName -> " ) );
}


/**
 *  Fills positions_ with a batch of microindexes starting at document docNo.
 *
 *  The batch starts at microIndexOffsets_[docNo] and extends over as many
 *  following documents as fit into the positionsL_ bytes of positions_.
 *  currentBatchOffset_ and maxDocNumberInCache_ are updated accordingly.
 *  If not even the microindex of docNo fits, positions_ is re-allocated
 *  with a larger size and the function calls itself once more.
 *
 *  @param docNo  first document of the batch; it is used to index
 *                microIndexOffsets_ without a range check in this function
 */
void XmlIndex::readMicroindexes( sal_Int32 docNo ) throw( xmlsearch::excep::IOException )
{
    currentBatchOffset_ = microIndexOffsets_[docNo];
    // Highest file offset that still fits into the current buffer.
    sal_Int32 offsetLimit = currentBatchOffset_ + positionsL_;
    sal_Int32 upTo = 0, nextDoc = docNo;
    sal_Int32 lastOffset = 0;

    // Walk forward until either the end of the offset table is reached
    // (then lastOffset is the file length) or a document starts beyond
    // offsetLimit (then lastOffset is that document's offset).
    do
    {
        if( ++nextDoc == sal_Int32( microIndexOffsets_.size() ) )
            lastOffset = sal_Int32( positionsFile_->length() );
        else if( microIndexOffsets_[ nextDoc ] > offsetLimit )
            lastOffset = microIndexOffsets_[ nextDoc ];
    }
    while( lastOffset == 0 );

    if( lastOffset > offsetLimit )
    {
        // The data up to lastOffset does not fit: stop at the start of
        // document nextDoc - 1, so nextDoc - 2 is the last complete one.
        upTo = microIndexOffsets_[ nextDoc - 1 ];
        maxDocNumberInCache_ = nextDoc - 2;
    }
    else
    {
        // Everything up to lastOffset fits into the buffer.
        upTo = lastOffset;
        maxDocNumberInCache_ = nextDoc - 1;
    }

    if( maxDocNumberInCache_ < docNo )
    { // cache too small
        // for current microindex:
        // grow the buffer to reach lastOffset and start over
        delete[] positions_;
        positions_ = new sal_Int8[ positionsL_ = lastOffset - currentBatchOffset_ ];
        readMicroindexes( docNo );
        return;
    }

    // Load the bytes [currentBatchOffset_, upTo) to the start of positions_.
    positionsFile_->seek( currentBatchOffset_ );
    positionsFile_->readBytes( positions_,upTo - currentBatchOffset_ );
}


/**
 *  Converts a query hit into a newly allocated QueryHitData.
 *
 *  Every second entry of the hit's match array (indices 0, 2, 4, ...) is
 *  treated as a term id; positive ids are resolved with fetch(), terms that
 *  cannot be fetched stay empty.  Afterwards the context tables are switched
 *  to the hit's document and the context search is reset.
 *
 *  @param hit  the hit to convert; must not be null
 *  @return a new QueryHitData holding penalty, document name and terms
 *          (the terms array is handed over to it - presumably together
 *          with its ownership; confirm against QueryHitData)
 *  @throws excep::XmlSearchException  from documentName() if the hit's
 *          document number is out of range
 */
QueryHitData* XmlIndex::hitToData( QueryHit* hit )
{
    sal_Int32 termsL,matchesL;
    sal_Int32 *matches = hit->getMatches( matchesL );
    rtl::OUString *terms = new rtl::OUString[ termsL = matchesL >>/*>*/ 1 ];
    for( sal_Int32 i = 0; i < termsL; ++i )
    {
        sal_Int32 aInt = ( i << 1 );
        if( 0 <= aInt && aInt < matchesL )
        {
            sal_Int32 match = matches[ aInt ];
            if( match > 0 )
                try
                {
                    terms[i] = fetch( match );
                }
                catch( const excep::XmlSearchException& )
                {
                    // best effort: an unresolvable term is left empty
                }
        }
    }

    sal_Int32 document = hit->getDocument();

    // Resolve the name before constructing the result: documentName() may
    // throw, and at that point nobody but this function owns 'terms', so it
    // has to be released here to avoid leaking it.
    rtl::OUString docName;
    try
    {
        docName = documentName( document );
    }
    catch( ... )
    {
        delete[] terms;
        throw;
    }

    QueryHitData *res = new QueryHitData( hit->getPenalty(),
                                          docName,
                                          termsL,terms );
    contextTables_->setMicroindex( document );
    contextTables_->resetContextSearch();
    return res;
}