328 lines
10 KiB
C++
328 lines
10 KiB
C++
![]() |
/*************************************************************************
|
||
|
*
|
||
|
* $RCSfile: XmlIndex.cxx,v $
|
||
|
*
|
||
|
* $Revision: 1.1 $
|
||
|
*
|
||
|
* last change: $Author: abi $ $Date: 2001-05-08 12:02:45 $
|
||
|
*
|
||
|
* The Contents of this file are made available subject to the terms of
|
||
|
* either of the following licenses
|
||
|
*
|
||
|
* - GNU Lesser General Public License Version 2.1
|
||
|
* - Sun Industry Standards Source License Version 1.1
|
||
|
*
|
||
|
* Sun Microsystems Inc., October, 2000
|
||
|
*
|
||
|
* GNU Lesser General Public License Version 2.1
|
||
|
* =============================================
|
||
|
* Copyright 2000 by Sun Microsystems, Inc.
|
||
|
* 901 San Antonio Road, Palo Alto, CA 94303, USA
|
||
|
*
|
||
|
* This library is free software; you can redistribute it and/or
|
||
|
* modify it under the terms of the GNU Lesser General Public
|
||
|
* License version 2.1, as published by the Free Software Foundation.
|
||
|
*
|
||
|
* This library is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
* Lesser General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU Lesser General Public
|
||
|
* License along with this library; if not, write to the Free Software
|
||
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
||
|
* MA 02111-1307 USA
|
||
|
*
|
||
|
*
|
||
|
* Sun Industry Standards Source License Version 1.1
|
||
|
* =================================================
|
||
|
* The contents of this file are subject to the Sun Industry Standards
|
||
|
* Source License Version 1.1 (the "License"); You may not use this file
|
||
|
* except in compliance with the License. You may obtain a copy of the
|
||
|
* License at http://www.openoffice.org/license.html.
|
||
|
*
|
||
|
* Software provided under this License is provided on an "AS IS" basis,
|
||
|
* WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
|
||
|
* WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
|
||
|
* MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
|
||
|
* See the License for the specific provisions governing your rights and
|
||
|
* obligations concerning the Software.
|
||
|
*
|
||
|
* The Initial Developer of the Original Code is: Sun Microsystems, Inc.
|
||
|
*
|
||
|
* Copyright: 2000 by Sun Microsystems, Inc.
|
||
|
*
|
||
|
* All Rights Reserved.
|
||
|
*
|
||
|
* Contributor(s): _______________________________________
|
||
|
*
|
||
|
*
|
||
|
************************************************************************/
|
||
|
#ifndef _XMLSEARCH_QE_XMLINDEX_HXX_
|
||
|
#include <qe/XmlIndex.hxx>
|
||
|
#endif
|
||
|
#ifndef _XMLSEARCH_QE_DOCGENERATOR_HXX_
|
||
|
#include <qe/DocGenerator.hxx>
|
||
|
#endif
|
||
|
#ifndef _XMLSEARCH_UTIL_CONCEPTLIST_HXX_
|
||
|
#include <util/ConceptList.hxx>
|
||
|
#endif
|
||
|
#ifndef _XMLSEARCH_UTIL_RANDOMACCESSSTREAM_HXX_
|
||
|
#include <util/RandomAccessStream.hxx>
|
||
|
#endif
|
||
|
#ifndef _XMLSEARCH_UTIL_DECOMPRESSOR_HXX_
|
||
|
#include <util/Decompressor.hxx>
|
||
|
#endif
|
||
|
|
||
|
|
||
|
using namespace xmlsearch;
|
||
|
using namespace xmlsearch::qe;
|
||
|
|
||
|
|
||
|
// extern sal_Int32 getInteger_( const sal_Int8* );
|
||
|
|
||
|
|
||
|
XmlIndex::XmlIndex( const rtl::OUString& indexDir )
|
||
|
: indexAccessor_( indexDir ),
|
||
|
dict_( 0 ),
|
||
|
documents_( 0 ),
|
||
|
concepts_( 0 ),
|
||
|
allLists_( 0 ),
|
||
|
allListsL_( 0 ),
|
||
|
positionsL_( 0 ),
|
||
|
positions_( 0 ),
|
||
|
contextsDataL_( 0 ),
|
||
|
contextsData_( 0 ),
|
||
|
contextTables_( 0 )
|
||
|
{
|
||
|
// reading DOCS
|
||
|
{
|
||
|
allListsL_ = indexAccessor_.readByteArray( allLists_,
|
||
|
rtl::OUString::createFromAscii("DOCS") ); // reading DOCS
|
||
|
}
|
||
|
|
||
|
// reading CONTEXTS
|
||
|
{
|
||
|
contextsDataL_ = indexAccessor_.readByteArray( contextsData_,
|
||
|
rtl::OUString::createFromAscii("CONTEXTS") ); // reading CONTEXTS
|
||
|
}
|
||
|
|
||
|
// reading POSITIONS
|
||
|
{
|
||
|
positionsFile_ = indexAccessor_.getStream( rtl::OUString::createFromAscii( "POSITIONS" ),
|
||
|
rtl::OUString::createFromAscii( "r" ) );
|
||
|
|
||
|
//!!! temporary: better than fixed large value, worse than 'intelligent' size mgt
|
||
|
if( allInCache_ = true ) // yes, intended
|
||
|
{
|
||
|
reset();
|
||
|
positions_ = new sal_Int8[ positionsL_ = positionsFile_->length() ];
|
||
|
positionsFile_->readBytes( positions_,positionsL_ );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
// reading DOCS.TAB
|
||
|
{
|
||
|
util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "DOCS.TAB" ),
|
||
|
rtl::OUString::createFromAscii( "r" ) );
|
||
|
sal_Int8 a[4];
|
||
|
a[0] = a[1] = a[2] = 0;
|
||
|
in->readBytes( &a[3],1 );
|
||
|
sal_Int32 k1 = ::getInteger_( a );
|
||
|
util::StreamDecompressor sddocs( in );
|
||
|
sddocs.ascDecode( k1,concepts_ );
|
||
|
in->readBytes( &a[3],1 );
|
||
|
sal_Int32 k2 = ::getInteger_( a );
|
||
|
offsets_.push_back( 0 );
|
||
|
util::StreamDecompressor sdoffsets( in );
|
||
|
sdoffsets.ascDecode( k2,offsets_ );
|
||
|
delete in;
|
||
|
|
||
|
// int a;
|
||
|
// for( a = 0; a < offsets_.size(); ++a )
|
||
|
// cout << "concepts_[" << a << "] = " << concepts_[a] << endl;
|
||
|
// for( a = 0; a < offsets_.size(); ++a )
|
||
|
// cout << "offsets_[" << a << "] = " << offsets_[a] << endl;
|
||
|
}
|
||
|
|
||
|
// reading OFFSETS
|
||
|
{
|
||
|
util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "OFFSETS" ),
|
||
|
rtl::OUString::createFromAscii( "r" ) );
|
||
|
sal_Int8 a[4];
|
||
|
a[0] = a[1] = a[2] = 0;
|
||
|
in->readBytes( &a[3],1 );
|
||
|
sal_Int32 k1 = ::getInteger_( a );
|
||
|
util::StreamDecompressor sddocs( in );
|
||
|
sddocs.decode( k1,documents_ );
|
||
|
in->readBytes( &a[3],1 );
|
||
|
sal_Int32 k2 = ::getInteger_( a );
|
||
|
util::StreamDecompressor sdoffsets( in );
|
||
|
sdoffsets.ascDecode( k2,microIndexOffsets_ );
|
||
|
in->readBytes( &a[3],1 );
|
||
|
sal_Int32 k3 = ::getInteger_( a );
|
||
|
util::StreamDecompressor sdtitles( in );
|
||
|
sdtitles.decode( k3,titles_ );
|
||
|
|
||
|
in->readBytes( &a[3],1 );
|
||
|
sal_Int32 k4 = ::getInteger_( a );
|
||
|
// contextsOffsets_ = new IntegerArray(_documents.cardinality() + 1);
|
||
|
util::StreamDecompressor co(in);
|
||
|
// _contextsOffsets.add(0); // first, trivial offset
|
||
|
co.ascDecode( k4,contextsOffsets_ );
|
||
|
|
||
|
delete in;
|
||
|
}
|
||
|
|
||
|
// Hard coding linknames ( object serialization is hard to undo )
|
||
|
{
|
||
|
linkNames_ = new rtl::OUString[ linkNamesL_ = 8 ];
|
||
|
linkNames_[0] = rtl::OUString::createFromAscii( "help:link" );
|
||
|
linkNames_[1] = rtl::OUString::createFromAscii( "help:help-text" );
|
||
|
linkNames_[2] = rtl::OUString::createFromAscii( "text:p" );
|
||
|
linkNames_[3] = rtl::OUString::createFromAscii( "text:span" );
|
||
|
linkNames_[4] = rtl::OUString::createFromAscii( "headingheading" );
|
||
|
linkNames_[5] = rtl::OUString::createFromAscii( "office:body" );
|
||
|
linkNames_[6] = rtl::OUString::createFromAscii( "help:to-be-embedded" );
|
||
|
linkNames_[7] = rtl::OUString::createFromAscii( "office:document" );
|
||
|
}
|
||
|
|
||
|
|
||
|
{
|
||
|
contextTables_ = new ContextTables( contextsOffsets_,
|
||
|
contextsDataL_,contextsData_,
|
||
|
linkNamesL_,linkNames_ );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
// Releases everything the constructor allocated with new / new[].
XmlIndex::~XmlIndex()
{
    // raw byte buffers
    delete[] allLists_;
    delete[] contextsData_;
    delete[] positions_;

    // array of link name strings
    delete[] linkNames_;

    // the POSITIONS stream that was kept open for readMicroindexes()
    delete positionsFile_;

    // the context tables go last
    delete contextTables_;
}
|
||
|
|
||
|
|
||
|
|
||
|
void XmlIndex::reset()
|
||
|
{
|
||
|
maxDocNumberInCache_ = ( allInCache_ ? ( microIndexOffsets_.size() - 1 ) : sal_Int32( -1 ) );
|
||
|
}
|
||
|
|
||
|
|
||
|
// Binary search for 'value' in 'arr'.
//
// The elements of 'arr' have to be in ascending order for the result to be
// meaningful. Returns the index of an element equal to 'value', or -1 if
// there is none (which includes the case of an empty vector).
sal_Int32 binarySearch( const std::vector<sal_Int32>& arr,sal_Int32 value )
{
    // Closed interval [low,high]. The upper bound has to start at the last
    // valid index: starting at size() made the loop read arr[size()] for an
    // empty vector or for a value larger than every element.
    sal_Int32 low = 0;
    sal_Int32 high = sal_Int32( arr.size() ) - 1;

    while( low <= high )
    {
        // written this way so that low + high cannot overflow
        const sal_Int32 middle = low + ( high - low ) / 2;

        if( arr[middle] < value )
            low = middle + 1;
        else if( value < arr[middle] )
            high = middle - 1;
        else
            return middle;
    }

    return -1;
}
|
||
|
|
||
|
|
||
|
// Looks 'concept' up in concepts_ and, if it is present, returns a newly
// allocated util::ConceptList over allLists_ that starts at the offset
// stored for this concept in offsets_. Returns 0 for an unknown concept.
// The list is created with new and handed out as a raw pointer.
NonnegativeIntegerGenerator* XmlIndex::getDocumentIterator( sal_Int32 concept )
{
    const sal_Int32 slot = binarySearch( concepts_,concept );

    if( slot < 0 )
        return 0;       // concept does not occur in this index

    return new util::ConceptList( allLists_,allListsL_,offsets_[slot] );
}
|
||
|
|
||
|
|
||
|
// Tells whether 'concept' is one of the entries of concepts_.
bool XmlIndex::occursInText( sal_Int32 concept )
{
    const sal_Int32 slot = binarySearch( concepts_,concept );
    return !( slot < 0 );
}
|
||
|
|
||
|
|
||
|
// Makes 'docNo' the current microindex of the context tables and returns
// the positions cache.
//
//   len    receives the size of the cache ( positionsL_ )
//   docNo  number of the document whose positions are wanted
//
// If docNo lies beyond maxDocNumberInCache_ the cache is refilled through
// readMicroindexes() first. The pointer returned is the member buffer
// positions_ itself, not a copy.
sal_Int8* XmlIndex::getPositions( sal_Int32& len,sal_Int32 docNo ) throw( excep::XmlSearchException )
{
    contextTables_->setMicroindex( docNo );

    const bool alreadyCached = !( docNo > maxDocNumberInCache_ );
    if( !alreadyCached )
    {
        readMicroindexes( docNo );
    }

    len = positionsL_;
    return positions_;
}
|
||
|
|
||
|
|
||
|
// Returns what dict_.fetch() delivers for the entry of documents_ that
// belongs to 'docNumber'.
//
// Throws excep::XmlSearchException if docNumber is negative or not smaller
// than the number of entries in documents_.
rtl::OUString XmlIndex::documentName( sal_Int32 docNumber ) throw( excep::XmlSearchException )
{
    // the sign is tested first, so the unsigned conversion only ever sees
    // non-negative numbers
    const bool valid = ( docNumber >= 0 ) && ( sal_uInt32( docNumber ) < documents_.size() );

    if( !valid )
    {
        rtl::OUString message = rtl::OUString::createFromAscii( "XmlIndex::documentName -> " );
        throw excep::XmlSearchException( message );
    }

    return dict_.fetch( documents_[ docNumber ] );
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
// Refills the positions cache with a batch of microindexes that starts at
// document 'docNo'.
//
// The batch begins at microIndexOffsets_[docNo] and takes in as many of the
// following documents as fit into the current cache size ( positionsL_ ).
// maxDocNumberInCache_ is updated to the last document covered. If not even
// docNo itself fits, the cache is reallocated and the function runs again.
void XmlIndex::readMicroindexes( sal_Int32 docNo ) throw( xmlsearch::excep::IOException )
{
    currentBatchOffset_ = microIndexOffsets_[docNo];

    // first file offset that is no longer covered by a cache of this size
    const sal_Int32 limit = currentBatchOffset_ + positionsL_;

    // Walk over the following documents until either the offset table is
    // used up ( the file length serves as end marker then ) or an offset
    // beyond the limit shows up. A value of 0 means "nothing found yet".
    sal_Int32 probe = docNo;
    sal_Int32 stopOffset = 0;
    while( stopOffset == 0 )
    {
        ++probe;
        if( probe == sal_Int32( microIndexOffsets_.size() ) )
            stopOffset = sal_Int32( positionsFile_->length() );
        else if( microIndexOffsets_[ probe ] > limit )
            stopOffset = microIndexOffsets_[ probe ];
    }

    sal_Int32 readEnd = 0;
    if( stopOffset > limit )
    {
        // the document ending at stopOffset does not fit completely,
        // so the batch ends one document earlier
        readEnd = microIndexOffsets_[ probe - 1 ];
        maxDocNumberInCache_ = probe - 2;
    }
    else
    {
        readEnd = stopOffset;
        maxDocNumberInCache_ = probe - 1;
    }

    if( maxDocNumberInCache_ < docNo )
    {
        // cache too small for the current microindex:
        // make it just large enough and start over
        delete[] positions_;
        positionsL_ = stopOffset - currentBatchOffset_;
        positions_ = new sal_Int8[ positionsL_ ];
        readMicroindexes( docNo );
        return;
    }

    positionsFile_->seek( currentBatchOffset_ );
    positionsFile_->readBytes( positions_,readEnd - currentBatchOffset_ );
}
|