443 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			443 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*************************************************************************
 | |
|  *
 | |
|  *  $RCSfile: XmlIndex.cxx,v $
 | |
|  *
 | |
|  *  $Revision: 1.7 $
 | |
|  *
 | |
|  *  last change: $Author: abi $ $Date: 2001-07-06 10:03:26 $
 | |
|  *
 | |
|  *  The Contents of this file are made available subject to the terms of
 | |
|  *  either of the following licenses
 | |
|  *
 | |
|  *         - GNU Lesser General Public License Version 2.1
 | |
|  *         - Sun Industry Standards Source License Version 1.1
 | |
|  *
 | |
|  *  Sun Microsystems Inc., October, 2000
 | |
|  *
 | |
|  *  GNU Lesser General Public License Version 2.1
 | |
|  *  =============================================
 | |
|  *  Copyright 2000 by Sun Microsystems, Inc.
 | |
|  *  901 San Antonio Road, Palo Alto, CA 94303, USA
 | |
|  *
 | |
|  *  This library is free software; you can redistribute it and/or
 | |
|  *  modify it under the terms of the GNU Lesser General Public
 | |
|  *  License version 2.1, as published by the Free Software Foundation.
 | |
|  *
 | |
|  *  This library is distributed in the hope that it will be useful,
 | |
|  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  *  Lesser General Public License for more details.
 | |
|  *
 | |
|  *  You should have received a copy of the GNU Lesser General Public
 | |
|  *  License along with this library; if not, write to the Free Software
 | |
|  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 | |
|  *  MA  02111-1307  USA
 | |
|  *
 | |
|  *
 | |
|  *  Sun Industry Standards Source License Version 1.1
 | |
|  *  =================================================
 | |
|  *  The contents of this file are subject to the Sun Industry Standards
 | |
|  *  Source License Version 1.1 (the "License"); You may not use this file
 | |
|  *  except in compliance with the License. You may obtain a copy of the
 | |
|  *  License at http://www.openoffice.org/license.html.
 | |
|  *
 | |
|  *  Software provided under this License is provided on an "AS IS" basis,
 | |
|  *  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
 | |
|  *  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
 | |
|  *  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
 | |
|  *  See the License for the specific provisions governing your rights and
 | |
|  *  obligations concerning the Software.
 | |
|  *
 | |
|  *  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
 | |
|  *
 | |
|  *  Copyright: 2000 by Sun Microsystems, Inc.
 | |
|  *
 | |
|  *  All Rights Reserved.
 | |
|  *
 | |
|  *  Contributor(s): _______________________________________
 | |
|  *
 | |
|  *
 | |
|  ************************************************************************/
 | |
| #ifndef _OSL_DIAGNOSE_H_
 | |
| #include <osl/diagnose.h>
 | |
| #endif
 | |
| #ifndef _XMLSEARCH_QE_XMLINDEX_HXX_
 | |
| #include <qe/XmlIndex.hxx>
 | |
| #endif
 | |
| #ifndef _XMLSEARCH_QE_DOCGENERATOR_HXX_
 | |
| #include <qe/DocGenerator.hxx>
 | |
| #endif
 | |
| #ifndef _XMLSEARCH_UTIL_CONCEPTLIST_HXX_
 | |
| #include <util/ConceptList.hxx>
 | |
| #endif
 | |
| #ifndef _XMLSEARCH_UTIL_RANDOMACCESSSTREAM_HXX_
 | |
| #include <util/RandomAccessStream.hxx>
 | |
| #endif
 | |
| #ifndef _XMLSEARCH_UTIL_DECOMPRESSOR_HXX_
 | |
| #include <util/Decompressor.hxx>
 | |
| #endif
 | |
| #ifndef _XMLSEARCH_QE_QUERY_HXX_
 | |
| #include <qe/Query.hxx>
 | |
| #endif
 | |
| 
 | |
| using namespace xmlsearch;
 | |
| using namespace xmlsearch::excep;
 | |
| using namespace xmlsearch::qe;
 | |
| 
 | |
| 
 | |
| // extern sal_Int32 getInteger_( const sal_Int8* );
 | |
| 
 | |
| 
 | |
| XmlIndex::XmlIndex( const rtl::OUString& indexDir )
 | |
|     throw( IOException )
 | |
|     : indexAccessor_( indexDir ),
 | |
|       dict_( indexAccessor_ ),
 | |
|       documents_( 0 ),
 | |
|       concepts_( 0 ),
 | |
|       allLists_( 0 ),
 | |
|       allListsL_( 0 ),
 | |
|       positionsL_( 0 ),
 | |
|       positions_( 0 ),
 | |
|       contextsDataL_( 0 ),
 | |
|       contextsData_( 0 ),
 | |
|       contextTables_( 0 ),
 | |
|       currentBatchOffset_( 0 ),
 | |
|       maxDocNumberInCache_( -1 )
 | |
| {
 | |
|     // reading DOCS
 | |
|     try
 | |
|     {
 | |
|         allListsL_ = indexAccessor_.readByteArray( allLists_,
 | |
|                                                    rtl::OUString::createFromAscii("DOCS") ); // reading DOCS
 | |
|     }
 | |
|     catch( IOException )
 | |
|     {
 | |
|         OSL_ENSURE( allLists_ != 0, "XmlIndex::XmlIndex -> cannot open DOCS/docs" );
 | |
|         throw;
 | |
|     }
 | |
| 
 | |
|     // reading CONTEXTS
 | |
|     try
 | |
|     {
 | |
|         contextsDataL_ = indexAccessor_.readByteArray( contextsData_,
 | |
|                                                        rtl::OUString::createFromAscii("CONTEXTS") ); // reading CONTEXTS
 | |
|     }
 | |
|     catch( IOException )
 | |
|     {
 | |
|         OSL_ENSURE( allLists_ != 0, "XmlIndex::XmlIndex -> cannot open CONTEXTS/contexts" );
 | |
|         delete[] allLists_;
 | |
|         throw;
 | |
|     }
 | |
| 
 | |
|     // reading POSITIONS
 | |
|     {
 | |
|         positionsFile_ = indexAccessor_.getStream( rtl::OUString::createFromAscii( "POSITIONS" ),
 | |
|                                                    rtl::OUString::createFromAscii( "r" ) );
 | |
| 
 | |
|         OSL_ENSURE( positionsFile_ != 0, "XmlIndex::XmlIndex -> cannot open POSITIONS/positions" );
 | |
| 
 | |
|         if( positionsFile_ )
 | |
|         {
 | |
|             //!!! temporary: better than fixed large value, worse than 'intelligent' size mgt
 | |
|             if( allInCache_ = true )   // yes, intended
 | |
|             {
 | |
|                 reset();
 | |
|                 positions_ = new sal_Int8[ positionsL_ = positionsFile_->length() ];
 | |
|                 positionsFile_->readBytes( positions_,positionsL_ );
 | |
|             }
 | |
|         }
 | |
|         else
 | |
|         {
 | |
|             delete[] allLists_;
 | |
|             delete[] contextsData_;
 | |
|             throw IOException( rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no POSITIONS/positions") );
 | |
|         }
 | |
|     }
 | |
| 
 | |
| 
 | |
|     // reading DOCS.TAB
 | |
|     {
 | |
|         util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "DOCS.TAB" ),
 | |
|                                                                  rtl::OUString::createFromAscii( "r" ) );
 | |
| 
 | |
|         if( in )
 | |
|         {
 | |
|             sal_Int8 a[4];
 | |
|             a[0] = a[1] = a[2] = 0;
 | |
|             in->readBytes( &a[3],1 );
 | |
|             sal_Int32 k1 = ::getInteger_( a );
 | |
|             util::StreamDecompressor sddocs( in );
 | |
|             sddocs.ascDecode( k1,concepts_ );
 | |
|             in->readBytes( &a[3],1 );
 | |
|             sal_Int32 k2 = ::getInteger_( a );
 | |
|             offsets_.push_back( 0 );
 | |
|             util::StreamDecompressor sdoffsets( in );
 | |
|             sdoffsets.ascDecode( k2,offsets_ );
 | |
|             delete in;
 | |
|         }
 | |
|         else
 | |
|         {
 | |
|             delete[] allLists_;
 | |
|             delete[] contextsData_;
 | |
|             delete[] positions_;
 | |
|             delete positionsFile_;
 | |
|             throw IOException( rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no DOCS.TAB/docs.tab") );
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     // reading OFFSETS
 | |
|     {
 | |
|         util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "OFFSETS" ),
 | |
|                                                                  rtl::OUString::createFromAscii( "r" ) );
 | |
|         if( in )
 | |
|         {
 | |
|             sal_Int8 a[4];
 | |
|             a[0] = a[1] = a[2] = 0;
 | |
|             in->readBytes( &a[3],1 );
 | |
|             sal_Int32 k1 = ::getInteger_( a );
 | |
|             util::StreamDecompressor sddocs( in );
 | |
|             sddocs.decode( k1,documents_ );
 | |
|             in->readBytes( &a[3],1 );
 | |
|             sal_Int32 k2 = ::getInteger_( a );
 | |
|             util::StreamDecompressor sdoffsets( in );
 | |
|             sdoffsets.ascDecode( k2,microIndexOffsets_ );
 | |
|             in->readBytes( &a[3],1 );
 | |
|             sal_Int32 k3 = ::getInteger_( a );
 | |
|             util::StreamDecompressor sdtitles( in );
 | |
|             sdtitles.decode( k3,titles_ );
 | |
| 
 | |
|             in->readBytes( &a[3],1 );
 | |
|             sal_Int32 k4 = ::getInteger_( a );
 | |
|             //     contextsOffsets_ = new IntegerArray(_documents.cardinality() + 1);
 | |
|             util::StreamDecompressor co(in);
 | |
|             //    _contextsOffsets.add(0);  // first, trivial offset
 | |
|             co.ascDecode( k4,contextsOffsets_ );
 | |
|             delete in;
 | |
|         }
 | |
|         else
 | |
|         {
 | |
|             delete[] allLists_;
 | |
|             delete[] contextsData_;
 | |
|             delete[] positions_;
 | |
|             delete positionsFile_;
 | |
|             throw IOException( rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no OFFSETS/offsets") );
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     // reading linknames
 | |
|     {
 | |
|         util::RandomAccessStream* in =
 | |
|             indexAccessor_.getStream( rtl::OUString::createFromAscii( "LINKNAMES" ),
 | |
|                                       rtl::OUString::createFromAscii( "r" ) );
 | |
|         if( ! in )
 | |
|         {
 | |
|             delete[] allLists_;
 | |
|             delete[] contextsData_;
 | |
|             delete[] positions_;
 | |
|             delete positionsFile_;
 | |
|             throw IOException(
 | |
|                 rtl::OUString::createFromAscii( "BtreeDict::BtreeDict -> no LINKNAMES/linknames" ) );
 | |
|         }
 | |
| 
 | |
|         sal_Int32 len = in->length();
 | |
|         char* bff = new char[ 1 + len ], *bff1 = new char[ 1 + len ];
 | |
|         bff[ len ] = 0;
 | |
|         in->readBytes( reinterpret_cast<sal_Int8*>( bff ),len );
 | |
|         delete in;
 | |
| 
 | |
|         // Now the buffer must be densified.
 | |
|         int i,len1 = 0;
 | |
|         for( i = 0; i < len; ++i )
 | |
|         {
 | |
|             if( bff[i] )
 | |
|                 bff1[ len1++ ] = bff[i];
 | |
|         }
 | |
|         bff1[len] = 0;
 | |
|         delete[] bff;
 | |
|         rtl::OString aStr( bff1 );  // build a string from the densified buffer;
 | |
|         delete[] bff1;
 | |
| 
 | |
|         // Now determine the order
 | |
|         rtl::OString LN[8];
 | |
|         LN[0] = "text:span";
 | |
|         LN[1] = "help:help-text";
 | |
|         LN[2] = "help:to-be-embedded";
 | |
|         LN[3] = "headingheading";
 | |
|         LN[4] = "office:body";
 | |
|         LN[5] = "text:p";
 | |
|         LN[6] = "office:document";
 | |
|         LN[7] = "help:link";
 | |
| 
 | |
|         int idx[8];
 | |
|         idx[0] = aStr.indexOf( LN[0] );
 | |
|         idx[1] = aStr.indexOf( LN[1] );
 | |
|         idx[2] = aStr.indexOf( LN[2] );
 | |
|         idx[3] = aStr.indexOf( LN[3] );
 | |
|         idx[4] = aStr.indexOf( LN[4] );
 | |
|         idx[5] = aStr.indexOf( LN[5] );
 | |
|         idx[6] = aStr.indexOf( LN[6] );
 | |
|         idx[7] = aStr.indexOf( LN[7] );
 | |
| 
 | |
|         linkNames_ = new rtl::OUString[ linkNamesL_ = 8 ];
 | |
| 
 | |
|         for( i = 0; i < 8; ++i )
 | |
|         {
 | |
|             int first;
 | |
|             int Place = 100000;
 | |
|             for( int j = 0; j < 8; ++j )
 | |
|             {
 | |
|                 if( idx[j] < Place )
 | |
|                     Place = idx[ first = j];
 | |
|             }
 | |
|             idx[first] = 100000;
 | |
|             linkNames_[i] = rtl::OUString( LN[first].getStr(),LN[first].getLength(),RTL_TEXTENCODING_UTF8 );
 | |
|         }
 | |
|     }  // end linknames
 | |
| 
 | |
| 
 | |
|     {
 | |
|         contextTables_ = new ContextTables( contextsOffsets_,
 | |
|                                             contextsDataL_,contextsData_,
 | |
|                                             linkNamesL_,linkNames_ );
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
| XmlIndex::~XmlIndex()
 | |
| {
 | |
|     delete[] allLists_;
 | |
|     delete[] contextsData_;
 | |
|     delete[] linkNames_;
 | |
|     delete[] positions_;
 | |
|     delete positionsFile_;
 | |
|     delete contextTables_;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| void XmlIndex::reset()
 | |
| {
 | |
|     maxDocNumberInCache_ = ( allInCache_ ? ( microIndexOffsets_.size() - 1 ) : sal_Int32( -1 ) );
 | |
| }
 | |
| 
 | |
| 
 | |
/**
 *  Looks up value in arr by bisection.
 *
 *  @param arr    values in ascending order (required for bisection to work).
 *  @param value  the value to look for.
 *  @return the index of an element equal to value, or -1 if there is none
 *          (which includes the case of an empty arr).
 */
sal_Int32 binarySearch( const std::vector<sal_Int32>& arr,sal_Int32 value )
{
    // The candidates form the closed range [low, high].  high starts at the
    // last valid index, so no probe can land one past the end of arr.
    sal_Int32 low = 0;
    sal_Int32 high = sal_Int32( arr.size() ) - 1;

    while( low <= high )
    {
        // midpoint written so that the intermediate sum cannot overflow
        const sal_Int32 mid = low + ( high - low ) / 2;

        if( arr[mid] < value )
            low = mid + 1;
        else if( value < arr[mid] )
            high = mid - 1;
        else
            return mid;
    }
    return -1;
}
 | |
| 
 | |
| 
 | |
| NonnegativeIntegerGenerator* XmlIndex::getDocumentIterator( sal_Int32 concept )
 | |
| {
 | |
|     sal_Int32 index = binarySearch( concepts_,concept );
 | |
| 
 | |
|     if( index >= 0 )
 | |
|         return new util::ConceptList( allLists_,allListsL_,offsets_[index] );
 | |
|     else
 | |
|         return 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| bool XmlIndex::occursInText( sal_Int32 concept )
 | |
| {
 | |
|     return binarySearch( concepts_,concept) >= 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| sal_Int8* XmlIndex::getPositions( sal_Int32& len,sal_Int32 docNo ) throw( excep::XmlSearchException )
 | |
| {
 | |
|     contextTables_->setMicroindex( docNo );
 | |
|     if( docNo > maxDocNumberInCache_ )
 | |
|         readMicroindexes( docNo );
 | |
| 
 | |
|     len = positionsL_;
 | |
|     return positions_;
 | |
| }
 | |
| 
 | |
| 
 | |
| rtl::OUString XmlIndex::documentName( sal_Int32 docNumber ) throw( excep::XmlSearchException )
 | |
| {
 | |
|     if( docNumber < 0 || documents_.size() <= sal_uInt32( docNumber ) )
 | |
|     {
 | |
|         rtl::OUString message = rtl::OUString::createFromAscii( "XmlIndex::documentName -> " );
 | |
|         throw excep::XmlSearchException( message );
 | |
|     }
 | |
| 
 | |
|     return dict_.fetch( documents_[ docNumber ] );
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| void XmlIndex::readMicroindexes( sal_Int32 docNo ) throw( xmlsearch::excep::IOException )
 | |
| {
 | |
|     currentBatchOffset_ = microIndexOffsets_[docNo];
 | |
|     sal_Int32 offsetLimit = currentBatchOffset_ + positionsL_;
 | |
|     sal_Int32 upTo = 0, nextDoc = docNo;
 | |
|     sal_Int32 lastOffset = 0;
 | |
| 
 | |
|     do
 | |
|     {
 | |
|         if( ++nextDoc == sal_Int32( microIndexOffsets_.size() ) )
 | |
|             lastOffset = sal_Int32( positionsFile_->length() );
 | |
|         else if( microIndexOffsets_[ nextDoc ] > offsetLimit )
 | |
|             lastOffset = microIndexOffsets_[ nextDoc ];
 | |
|     }
 | |
|     while( lastOffset == 0 );
 | |
| 
 | |
|     if( lastOffset > offsetLimit )
 | |
|     {
 | |
|         upTo = microIndexOffsets_[ nextDoc - 1 ];
 | |
|         maxDocNumberInCache_ = nextDoc - 2;
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|         upTo = lastOffset;
 | |
|         maxDocNumberInCache_ = nextDoc - 1;
 | |
|     }
 | |
| 
 | |
|     if( maxDocNumberInCache_ < docNo )
 | |
|     { // cache too small
 | |
|         // for current microindex
 | |
|         // System.out.println("expanding cache to " + _positionsCacheSize);
 | |
|         delete[] positions_;
 | |
|         positions_ = new sal_Int8[ positionsL_ = lastOffset - currentBatchOffset_ ];
 | |
|         readMicroindexes( docNo );
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     positionsFile_->seek( currentBatchOffset_ );
 | |
|     positionsFile_->readBytes( positions_,upTo - currentBatchOffset_ );
 | |
| }
 | |
| 
 | |
| 
 | |
| QueryHitData* XmlIndex::hitToData( QueryHit* hit )
 | |
| {
 | |
|     sal_Int32 termsL,matchesL;
 | |
|     sal_Int32 *matches = hit->getMatches( matchesL );
 | |
|     rtl::OUString *terms = new rtl::OUString[ termsL = matchesL >>/*>*/ 1 ];
 | |
|     for( sal_Int32 i = 0; i < termsL; ++i )
 | |
|     {
 | |
|         sal_Int32 aInt = ( i << 1 );
 | |
|         if( matches[ aInt ] > 0 )
 | |
|             terms[i] = fetch( matches[ aInt ] );
 | |
|     }
 | |
|     sal_Int32 document = hit->getDocument();
 | |
|     QueryHitData *res = new QueryHitData( hit->getPenalty(),
 | |
|                                           documentName( document ),
 | |
|                                           termsL,terms );
 | |
|     contextTables_->setMicroindex( document );
 | |
|     contextTables_->resetContextSearch();
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| 
 |