2008/04/01 16:07:45 thb 1.18.30.3: #i85898# Stripping all external header guards 2008/04/01 13:03:00 thb 1.18.30.2: #i85898# Stripping all external header guards 2008/03/31 13:04:26 rt 1.18.30.1: #i87441# Change license header to LPGL v3.
458 lines
14 KiB
C++
458 lines
14 KiB
C++
/*************************************************************************
|
|
*
|
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
*
|
|
* Copyright 2008 by Sun Microsystems, Inc.
|
|
*
|
|
* OpenOffice.org - a multi-platform office productivity suite
|
|
*
|
|
* $RCSfile: XmlIndex.cxx,v $
|
|
* $Revision: 1.19 $
|
|
*
|
|
* This file is part of OpenOffice.org.
|
|
*
|
|
* OpenOffice.org is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Lesser General Public License version 3
|
|
* only, as published by the Free Software Foundation.
|
|
*
|
|
* OpenOffice.org is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU Lesser General Public License version 3 for more details
|
|
* (a copy is included in the LICENSE file that accompanied this code).
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* version 3 along with OpenOffice.org. If not, see
|
|
* <http://www.openoffice.org/license.html>
|
|
* for a copy of the LGPLv3 License.
|
|
*
|
|
************************************************************************/
|
|
|
|
// MARKER(update_precomp.py): autogen include statement, do not remove
|
|
#include "precompiled_xmlhelp.hxx"
|
|
#include <osl/diagnose.h>
|
|
#include <qe/XmlIndex.hxx>
|
|
#include <qe/DocGenerator.hxx>
|
|
#include <util/ConceptList.hxx>
|
|
#ifndef _XMLSEARCH_UTIL_RANDOMACCESSSTREAM_HXX_
|
|
#include <util/RandomAccessStream.hxx>
|
|
#endif
|
|
#include <util/Decompressor.hxx>
|
|
#include <qe/Query.hxx>
|
|
|
|
using namespace xmlsearch;
|
|
using namespace xmlsearch::excep;
|
|
using namespace xmlsearch::qe;
|
|
|
|
|
|
// extern sal_Int32 getInteger_( const sal_Int8* );
|
|
|
|
|
|
/**
 * Opens the search index reachable through indexDir and loads its tables.
 *
 * The index members are read in this order: DOCS, CONTEXTS, POSITIONS,
 * DOCS.TAB, OFFSETS and LINKNAMES. Afterwards the ContextTables helper is
 * created from the decoded context offsets, the raw CONTEXTS bytes and the
 * ordered link names.
 *
 * @param indexDir  location handed to the index accessor.
 * @throws IOException  if one of the index members cannot be opened. In the
 *         explicit failure branches below, the buffers and streams acquired
 *         so far are released before the exception is thrown.
 */
XmlIndex::XmlIndex( const rtl::OUString& indexDir )
    throw( IOException )
    : currentBatchOffset_( 0 ),
      maxDocNumberInCache_( -1 ),
      indexAccessor_( indexDir ),
      dict_( indexAccessor_ ),
      contextTables_( 0 ),
      allListsL_( 0 ),
      allLists_( 0 ),
      positionsL_( 0 ),
      positions_( 0 ),
      contextsDataL_( 0 ),
      contextsData_( 0 ),
      concepts_( 0 ),
      documents_( 0 )
{
    // reading DOCS
    try
    {
        allListsL_ = indexAccessor_.readByteArray( allLists_,
                                                   rtl::OUString::createFromAscii("DOCS") );
    }
    catch( const IOException& )
    {
        OSL_ENSURE( allLists_ != 0, "XmlIndex::XmlIndex -> cannot open DOCS/docs" );
        throw;
    }

    // reading CONTEXTS
    try
    {
        contextsDataL_ = indexAccessor_.readByteArray( contextsData_,
                                                       rtl::OUString::createFromAscii("CONTEXTS") );
    }
    catch( const IOException& )
    {
        // Check the buffer that actually failed to load; testing allLists_
        // here would always succeed and silence the diagnostic.
        OSL_ENSURE( contextsData_ != 0, "XmlIndex::XmlIndex -> cannot open CONTEXTS/contexts" );
        delete[] allLists_;
        throw;
    }

    // reading POSITIONS
    {
        positionsFile_ = indexAccessor_.getStream( rtl::OUString::createFromAscii( "POSITIONS" ),
                                                   rtl::OUString::createFromAscii( "r" ) );

        OSL_ENSURE( positionsFile_ != 0, "XmlIndex::XmlIndex -> cannot open POSITIONS/positions" );

        if( positionsFile_ )
        {
            //!!! temporary: better than fixed large value, worse than 'intelligent' size mgt
            allInCache_ = true;
            if( allInCache_ ) // yes, intended
            {
                // NOTE(review): microIndexOffsets_ is only filled further
                // down (OFFSETS), so at this point reset() computes
                // size() - 1 on a still-empty vector -- confirm intended.
                reset();
                positionsL_ = positionsFile_->length();
                positions_ = new sal_Int8[ positionsL_ ];
                positionsFile_->readBytes( positions_,positionsL_ );
            }
        }
        else
        {
            delete[] allLists_;
            delete[] contextsData_;
            throw IOException( rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no POSITIONS/positions") );
        }
    }

    // reading DOCS.TAB
    {
        util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "DOCS.TAB" ),
                                                                 rtl::OUString::createFromAscii( "r" ) );

        if( in )
        {
            // Each table is preceded by a single byte; it is placed in the
            // last slot of a zeroed 4-byte buffer and handed to getInteger_.
            sal_Int8 a[4];
            a[0] = a[1] = a[2] = 0;

            in->readBytes( &a[3],1 );
            sal_Int32 k1 = ::getInteger_( a );
            util::StreamDecompressor sddocs( in );
            sddocs.ascDecode( k1,concepts_ );

            in->readBytes( &a[3],1 );
            sal_Int32 k2 = ::getInteger_( a );
            offsets_.push_back( 0 );
            util::StreamDecompressor sdoffsets( in );
            sdoffsets.ascDecode( k2,offsets_ );

            delete in;
        }
        else
        {
            delete[] allLists_;
            delete[] contextsData_;
            delete[] positions_;
            delete positionsFile_;
            throw IOException( rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no DOCS.TAB/docs.tab") );
        }
    }

    // reading OFFSETS
    {
        util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "OFFSETS" ),
                                                                 rtl::OUString::createFromAscii( "r" ) );
        if( in )
        {
            sal_Int8 a[4];
            a[0] = a[1] = a[2] = 0;

            in->readBytes( &a[3],1 );
            sal_Int32 k1 = ::getInteger_( a );
            util::StreamDecompressor sddocs( in );
            sddocs.decode( k1,documents_ );

            in->readBytes( &a[3],1 );
            sal_Int32 k2 = ::getInteger_( a );
            util::StreamDecompressor sdoffsets( in );
            sdoffsets.ascDecode( k2,microIndexOffsets_ );

            in->readBytes( &a[3],1 );
            sal_Int32 k3 = ::getInteger_( a );
            util::StreamDecompressor sdtitles( in );
            sdtitles.decode( k3,titles_ );

            in->readBytes( &a[3],1 );
            sal_Int32 k4 = ::getInteger_( a );
            // contextsOffsets_ = new IntegerArray(_documents.cardinality() + 1);
            util::StreamDecompressor co(in);
            // _contextsOffsets.add(0); // first, trivial offset
            co.ascDecode( k4,contextsOffsets_ );

            delete in;
        }
        else
        {
            delete[] allLists_;
            delete[] contextsData_;
            delete[] positions_;
            delete positionsFile_;
            throw IOException( rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no OFFSETS/offsets") );
        }
    }

    // reading linknames
    {
        util::RandomAccessStream* in =
            indexAccessor_.getStream( rtl::OUString::createFromAscii( "LINKNAMES" ),
                                      rtl::OUString::createFromAscii( "r" ) );
        if( ! in )
        {
            delete[] allLists_;
            delete[] contextsData_;
            delete[] positions_;
            delete positionsFile_;
            throw IOException(
                rtl::OUString::createFromAscii( "XmlIndex::XmlIndex -> no LINKNAMES/linknames" ) );
        }

        sal_Int32 len = in->length();
        if( len < 0 )
            len = 0;

        // One self-releasing buffer; the extra element holds the terminator.
        std::vector<char> buffer( 1 + len );
        buffer[ len ] = 0;
        in->readBytes( reinterpret_cast<sal_Int8*>( &buffer[0] ),len );
        delete in;

        // Now the buffer must be densified: drop every NUL byte, in place.
        int i,len1 = 0;
        for( i = 0; i < len; ++i )
        {
            if( buffer[i] )
                buffer[ len1++ ] = buffer[i];
        }
        buffer[len1] = 0;

        rtl::OString aStr( &buffer[0] ); // build a string from the densified buffer

        // Now determine the order
        const int nameCount = 16;
        const int unreachablePlace = 100000;

        /**
         * The available names cannot be determined from LINKNAMES at current,
         * because LINKNAMES is a serialized Java-object
         * Always update LINKNAMES if index.xsl or default.xsl are modified.
         */
        rtl::OString LN[nameCount];
        LN[0] = "helpdocument";
        LN[1] = "body";
        LN[2] = "title";
        LN[3] = "table";
        LN[4] = "tablecell";
        LN[5] = "tablerow";
        LN[6] = "list";
        LN[7] = "listitem";
        LN[8] = "item";
        LN[9] = "emph";
        LN[10] = "paragraph";
        LN[11] = "section";
        LN[12] = "bookmark";
        LN[13] = "bookmark_value";
        LN[14] = "ahelp";
        LN[15] = "link";

        // Determine index in file; names that do not occur are parked at
        // unreachablePlace and not counted.
        int idx[nameCount];
        linkNamesL_ = nameCount;
        for( i = 0; i < nameCount; ++i )
        {
            idx[i] = aStr.indexOf( LN[i] );
            if( idx[i] == -1 )
            {
                idx[i] = unreachablePlace;
                --linkNamesL_;
            }
        }

        // Emit the found names ordered by their position in the file:
        // repeatedly pick the smallest remaining position.
        linkNames_ = new rtl::OUString[linkNamesL_];
        for( i = 0; i < linkNamesL_; ++i )
        {
            // TODO what happens to first if we never hit Place?
            // (as written it stays 0, so LN[0] would be emitted)
            int first = 0;
            int Place = unreachablePlace; // This is the definitely last place
            for( int j = 0; j < nameCount; ++j )
            {
                if( idx[j] < Place )
                {
                    first = j;
                    Place = idx[j];
                }
            }
            idx[first] = unreachablePlace;
            linkNames_[i] = rtl::OUString( LN[first].getStr(),LN[first].getLength(),RTL_TEXTENCODING_UTF8 );
        }
    } // end linknames

    {
        contextTables_ = new ContextTables(contextsOffsets_,
                                           contextsDataL_,contextsData_,
                                           linkNamesL_,linkNames_ );
    }
}
|
|
|
|
|
|
/**
 * Releases everything the constructor acquired: the raw index buffers,
 * the link-name array, the POSITIONS stream and the ContextTables helper.
 */
XmlIndex::~XmlIndex()
{
    // raw byte buffers and the link-name array
    delete[] allLists_;
    delete[] contextsData_;
    delete[] linkNames_;
    delete[] positions_;

    // single heap objects
    delete positionsFile_;
    delete contextTables_;
}
|
|
|
|
|
|
|
|
void XmlIndex::reset()
|
|
{
|
|
maxDocNumberInCache_ = allInCache_ ? static_cast<sal_Int32>(microIndexOffsets_.size()) - 1 : -1;
|
|
}
|
|
|
|
|
|
/**
 * Binary search for value in arr.
 *
 * The halving strategy requires arr to be sorted in ascending order.
 *
 * @param arr    ascending sequence to search; may be empty.
 * @param value  element to look for.
 * @return index of an element equal to value, or -1 if there is none.
 */
sal_Int32 binarySearch( const std::vector<sal_Int32>& arr,sal_Int32 value )
{
    sal_Int32 low = 0;
    // Inclusive upper bound: the last valid index, not size(). Starting at
    // size() let the probe reach arr[size()] for an empty vector or for a
    // value larger than every element.
    sal_Int32 high = static_cast<sal_Int32>( arr.size() ) - 1;

    while( low <= high )
    {
        const sal_Int32 mid = low + ( high - low ) / 2;
        if( arr[mid] < value )
            low = mid + 1;
        else if( value < arr[mid] )
            high = mid - 1;
        else
            return mid;
    }
    return -1;
}
|
|
|
|
|
|
/**
 * Creates a ConceptList for the given concept.
 *
 * @param concept  concept id looked up in concepts_.
 * @return a newly allocated util::ConceptList positioned at the matching
 *         entry of offsets_, or 0 when the concept is not in concepts_.
 */
NonnegativeIntegerGenerator* XmlIndex::getDocumentIterator( sal_Int32 concept )
{
    const sal_Int32 slot = binarySearch( concepts_,concept );

    // unknown concept: nothing to iterate
    if( slot < 0 )
        return 0;

    return new util::ConceptList( allLists_,allListsL_,offsets_[slot] );
}
|
|
|
|
|
|
/**
 * Tells whether the given concept id is present in concepts_.
 */
bool XmlIndex::occursInText( sal_Int32 concept )
{
    const sal_Int32 slot = binarySearch( concepts_,concept );
    return !( slot < 0 );
}
|
|
|
|
|
|
/**
 * Selects the microindex of docNo in the context tables and hands out the
 * positions buffer, reloading it first when docNo lies beyond the highest
 * document number currently cached.
 *
 * @param len    receives positionsL_, the length of the returned buffer.
 * @param docNo  document number.
 * @return positions_ (still owned by this object).
 */
sal_Int8* XmlIndex::getPositions( sal_Int32& len,sal_Int32 docNo ) throw( excep::XmlSearchException )
{
    contextTables_->setMicroindex( docNo );

    const bool beyondCache = maxDocNumberInCache_ < docNo;
    if( beyondCache )
    {
        readMicroindexes( docNo );
    }

    len = positionsL_;
    return positions_;
}
|
|
|
|
|
|
/**
 * Resolves a document number to its name via the dictionary.
 *
 * @param docNumber  index into documents_.
 * @return the string dict_ yields for documents_[docNumber].
 * @throws excep::XmlSearchException if docNumber is negative or not a
 *         valid index into documents_.
 */
rtl::OUString XmlIndex::documentName( sal_Int32 docNumber ) throw( excep::XmlSearchException )
{
    const bool inRange = docNumber >= 0 && sal_uInt32( docNumber ) < documents_.size();
    if( inRange )
    {
        return dict_.fetch( documents_[ docNumber ] );
    }

    throw excep::XmlSearchException(
        rtl::OUString::createFromAscii( "XmlIndex::documentName -> " ) );
}
|
|
|
|
|
|
|
|
|
|
/**
 * Loads into positions_ the slice of the POSITIONS stream that starts at
 * the microindex offset of docNo.
 *
 * The method walks forward through microIndexOffsets_ until it either runs
 * past the last document or finds an offset beyond what the current buffer
 * (positionsL_ bytes) can hold. If not even docNo itself fits, the buffer
 * is reallocated and the method runs once more.
 *
 * @param docNo  document number whose microindex must become available.
 */
void XmlIndex::readMicroindexes( sal_Int32 docNo ) throw( xmlsearch::excep::IOException )
{
    currentBatchOffset_ = microIndexOffsets_[docNo];
    const sal_Int32 limit = currentBatchOffset_ + positionsL_;

    // Find the first boundary that ends the batch; 0 means "not found yet".
    sal_Int32 candidate = docNo;
    sal_Int32 boundary = 0;
    while( boundary == 0 )
    {
        ++candidate;
        if( candidate == sal_Int32( microIndexOffsets_.size() ) )
        {
            boundary = sal_Int32( positionsFile_->length() );
        }
        else if( microIndexOffsets_[ candidate ] > limit )
        {
            boundary = microIndexOffsets_[ candidate ];
        }
    }

    sal_Int32 readEnd = 0;
    if( boundary <= limit )
    {
        // everything up to the boundary fits
        readEnd = boundary;
        maxDocNumberInCache_ = candidate - 1;
    }
    else
    {
        // the document ending at the boundary does not fit; stop one earlier
        readEnd = microIndexOffsets_[ candidate - 1 ];
        maxDocNumberInCache_ = candidate - 2;
    }

    if( maxDocNumberInCache_ >= docNo )
    {
        positionsFile_->seek( currentBatchOffset_ );
        positionsFile_->readBytes( positions_,readEnd - currentBatchOffset_ );
        return;
    }

    // cache too small for current microindex: grow it and try again
    delete[] positions_;
    positionsL_ = boundary - currentBatchOffset_;
    positions_ = new sal_Int8[ positionsL_ ];
    readMicroindexes( docNo );
}
|
|
|
|
|
|
/**
 * Converts a raw query hit into a QueryHitData object.
 *
 * Every second entry of the hit's match array (indices 0, 2, 4, ...) is
 * treated as a term id; positive ids are resolved through fetch(). Terms
 * that cannot be resolved are left as empty strings. Afterwards the context
 * tables are switched to the hit's document and their context search is
 * reset.
 *
 * @param hit  the hit to convert; must not be null.
 * @return a newly allocated QueryHitData built from the hit's penalty, the
 *         document name and the term array.
 *         NOTE(review): the term array is presumably owned by the returned
 *         object from then on -- confirm against QueryHitData.
 */
QueryHitData* XmlIndex::hitToData( QueryHit* hit )
{
    sal_Int32 matchesL = 0;
    sal_Int32 *matches = hit->getMatches( matchesL );

    const sal_Int32 termsL = matchesL >> 1;
    rtl::OUString *terms = new rtl::OUString[ termsL ];

    sal_Int32 document = 0;
    QueryHitData *res = 0;
    try
    {
        for( sal_Int32 i = 0; i < termsL; ++i )
        {
            const sal_Int32 aInt = ( i << 1 );
            if( 0 <= aInt && aInt < matchesL )
            {
                const sal_Int32 match = matches[ aInt ];
                if( match > 0 )
                {
                    try
                    {
                        terms[i] = fetch( match );
                    }
                    catch( const excep::XmlSearchException& )
                    {
                        // best effort: an unresolvable term stays empty
                    }
                }
            }
        }

        document = hit->getDocument();
        res = new QueryHitData( hit->getPenalty(),
                                documentName( document ),
                                termsL,terms );
    }
    catch( ... )
    {
        // No QueryHitData exists yet, so the term array is still ours;
        // without this it leaked whenever documentName() or an allocation
        // threw.
        delete[] terms;
        throw;
    }

    contextTables_->setMicroindex( document );
    contextTables_->resetContextSearch();
    return res;
}
|
|
|
|
|