726 lines
20 KiB
C++
726 lines
20 KiB
C++
/*************************************************************************
|
|
*
|
|
* OpenOffice.org - a multi-platform office productivity suite
|
|
*
|
|
* $RCSfile: Search.cxx,v $
|
|
*
|
|
* $Revision: 1.10 $
|
|
*
|
|
* last change: $Author: vg $ $Date: 2007-08-30 15:42:59 $
|
|
*
|
|
* The Contents of this file are made available subject to
|
|
* the terms of GNU Lesser General Public License Version 2.1.
|
|
*
|
|
*
|
|
* GNU Lesser General Public License Version 2.1
|
|
* =============================================
|
|
* Copyright 2005 by Sun Microsystems, Inc.
|
|
* 901 San Antonio Road, Palo Alto, CA 94303, USA
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License version 2.1, as published by the Free Software Foundation.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this library; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
|
* MA 02111-1307 USA
|
|
*
|
|
************************************************************************/
|
|
|
|
// MARKER(update_precomp.py): autogen include statement, do not remove
|
|
#include "precompiled_xmlhelp.hxx"
|
|
#ifndef _XMLSEARCH_QE_SEARCH_HXX_
|
|
#include <qe/Search.hxx>
|
|
#endif
|
|
|
|
|
|
using namespace std;
|
|
using namespace xmlsearch;
|
|
using namespace xmlsearch::qe;
|
|
|
|
|
|
|
|
|
|
/******************************************************************************/
|
|
/* */
|
|
/* Emtpy ConceptData/Query */
|
|
/* */
|
|
/******************************************************************************/
|
|
|
|
|
|
class EmptyConceptData
|
|
: public ConceptData
|
|
{
|
|
public:
|
|
|
|
virtual void generateFillers( std::vector< RoleFiller* >& , sal_Int32 ) { }
|
|
|
|
}; // end class EmptyQuery
|
|
|
|
|
|
|
|
class EmptyQuery
|
|
: public Query
|
|
{
|
|
public:
|
|
|
|
EmptyQuery()
|
|
: Query( 0,0,0,0,0 ),
|
|
conceptDataInstance_( )
|
|
{
|
|
}
|
|
|
|
ConceptData* makeConceptData( sal_Int32 col,
|
|
sal_Int32 concept,
|
|
double penalty,
|
|
sal_Int32 queryNo )
|
|
{
|
|
(void)col;
|
|
(void)concept;
|
|
(void)penalty;
|
|
(void)queryNo;
|
|
|
|
return &conceptDataInstance_;
|
|
}
|
|
|
|
private:
|
|
|
|
EmptyConceptData conceptDataInstance_;
|
|
|
|
}; // end class EmptyQuery
|
|
|
|
|
|
|
|
/******************************************************************************/
|
|
/* */
|
|
/* ConceptData1/Query1 */
|
|
/* */
|
|
/******************************************************************************/
|
|
|
|
|
|
class ConceptData1
|
|
: public ConceptData
|
|
{
|
|
public:
|
|
|
|
ConceptData1( sal_Int32 id,
|
|
sal_Int32 role,
|
|
double score,
|
|
sal_Int32 queryNo,
|
|
sal_Int32 nColumns,
|
|
ContextTables* ctxInfo,
|
|
sal_Int32 code )
|
|
: ConceptData( id,role,score,queryNo,nColumns,ctxInfo ),
|
|
fieldCode_( code )
|
|
{
|
|
}
|
|
|
|
virtual void generateFillers( std::vector< RoleFiller* >& array, sal_Int32 pos );
|
|
|
|
|
|
private:
|
|
|
|
sal_Int32 fieldCode_;
|
|
|
|
}; // end class ConceptData1
|
|
|
|
|
|
class Query1
|
|
: public Query
|
|
{
|
|
public:
|
|
|
|
Query1( XmlIndex* env,
|
|
sal_Int32 nColumns,
|
|
sal_Int32 nHits,
|
|
sal_Int32 missingPenaltiesL,
|
|
double* missingPenalties,
|
|
sal_Int32 fieldCode )
|
|
: Query( env,nColumns,nHits,missingPenaltiesL,missingPenalties ),
|
|
searchFieldCode_( fieldCode )
|
|
{
|
|
}
|
|
|
|
virtual ConceptData* makeConceptData( sal_Int32 col,
|
|
sal_Int32 concept,
|
|
double score,
|
|
sal_Int32 query );
|
|
|
|
|
|
private:
|
|
|
|
sal_Int32 searchFieldCode_;
|
|
};
|
|
|
|
|
|
|
|
/********************************************************************************/
|
|
// Impl
|
|
/********************************************************************************/
|
|
|
|
|
|
void ConceptData1::generateFillers( std::vector< RoleFiller* >& array, sal_Int32 pos )
|
|
{
|
|
if( array[ queryNo_ ] != RoleFiller::STOP() )
|
|
{ // not 'prohibited'
|
|
// !!! candidate for a single _ctx op
|
|
sal_Int32 ancestor = ctx_->firstParentWithCode(pos,fieldCode_);
|
|
if( ancestor != -1 )
|
|
{
|
|
|
|
|
|
RoleFiller* p = new RoleFiller( nColumns_,
|
|
this,
|
|
role_,
|
|
pos,
|
|
ancestor,
|
|
pos + proximity_);
|
|
p->use( array,queryNo_ );
|
|
}
|
|
}
|
|
|
|
if( next_.is() )
|
|
next_->generateFillers( array,pos );
|
|
}
|
|
|
|
|
|
|
|
ConceptData* Query1::makeConceptData( sal_Int32 col,
|
|
sal_Int32 concept,
|
|
double score,
|
|
sal_Int32 query )
|
|
{
|
|
return new ConceptData1( concept,col,score,query,nColumns_,ctx_,searchFieldCode_ );
|
|
}
|
|
|
|
|
|
|
|
/******************************************************************************/
|
|
/* */
|
|
/* QueryFactoryImpl */
|
|
/* */
|
|
/******************************************************************************/
|
|
|
|
|
|
class QueryFactoryImpl
|
|
{
|
|
public:
|
|
|
|
Query* makeQuery( XmlIndex* env,const rtl::OUString& context,sal_Int32 nColumns,sal_Int32 nHits);
|
|
|
|
Query* empty() { return &emptyQueryInstance_; }
|
|
|
|
private:
|
|
|
|
EmptyQuery emptyQueryInstance_;
|
|
|
|
}; // end class QueryFactoryImpl
|
|
|
|
|
|
|
|
|
|
Query* QueryFactoryImpl::makeQuery( XmlIndex* env,
|
|
const rtl::OUString& context,
|
|
sal_Int32 nColumns,
|
|
sal_Int32 nHits )
|
|
{
|
|
if( ! context.getLength() )
|
|
return new Query( env,nColumns,nHits,0,0 );
|
|
else if( context.indexOf( sal_Unicode( '|' ) ) != -1 )
|
|
{
|
|
return 0; //t
|
|
}
|
|
else if( context.indexOf( rtl::OUString::createFromAscii( "//" ) ) != -1 )
|
|
{
|
|
return 0; //t
|
|
}
|
|
else if( context.indexOf( sal_Unicode( '/' ) ) != -1 )
|
|
{
|
|
return 0; //t
|
|
}
|
|
else if( context.indexOf( sal_Unicode( '@' ) ) != -1 )
|
|
{
|
|
return 0; //t
|
|
}
|
|
else if( context.indexOf( sal_Unicode( '[' ) ) != -1 )
|
|
{
|
|
return 0; //t
|
|
}
|
|
else
|
|
{
|
|
sal_Int32 code = env->getContextInfo()->linkCode( context );
|
|
if( code != -1 )
|
|
return new Query1( env,nColumns,nHits,0,0,code);
|
|
else
|
|
return &emptyQueryInstance_;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
Search::Search( XmlIndex* env )
|
|
: env_( env ),
|
|
queryFactory_( 0 ),
|
|
nextDocGenHeap_(),
|
|
firstGenerator_(),
|
|
free2_( 0 ),
|
|
limit_( 0 ),
|
|
base_( 0 ),
|
|
concepts_( new sal_Int32[ ConceptGroupGenerator::NConceptsInGroup ] ),
|
|
dataL_( 0 ),
|
|
data_( 0 )
|
|
{
|
|
}
|
|
|
|
|
|
|
|
Search::~Search()
|
|
{
|
|
sal_uInt32 i;
|
|
Query* stopq = queryFactory_ ? queryFactory_->empty() : 0;
|
|
ConceptData* stopc = stopq ? stopq->makeConceptData( 0,0,0.0,0 ) : 0;
|
|
(void)stopc;
|
|
|
|
for( i = 0; i < queries_.size(); ++i )
|
|
if( queries_[i] != stopq )
|
|
delete queries_[i];
|
|
|
|
delete[] concepts_;
|
|
|
|
delete queryFactory_;
|
|
}
|
|
|
|
|
|
|
|
|
|
Query* Search::addQuery( const rtl::OUString& context,
|
|
sal_Int32 nValidTerms,sal_Int32 nMissingTerms,sal_Int32 nHits,
|
|
double variantPenalty,
|
|
const std::vector< sal_Int32 >& primary,
|
|
const std::vector< std::vector< sal_Int32 > >& columns )
|
|
{
|
|
// by now, scope == context
|
|
if( ! queryFactory_ )
|
|
queryFactory_ = new QueryFactoryImpl();
|
|
|
|
Query* query = queryFactory_->makeQuery( env_,context,nValidTerms,nHits );
|
|
query->missingTerms( nMissingTerms );
|
|
queries_.push_back( query );
|
|
|
|
for( sal_Int32 i = 0; i < nValidTerms; ++i )
|
|
{
|
|
if( primary[i] > 0 )
|
|
addTerm( i,primary[i],0.0 );
|
|
|
|
for( sal_uInt32 j = 0; j < columns[i].size(); ++j )
|
|
addTerm( i,columns[i][j],variantPenalty );
|
|
}
|
|
|
|
// start stop
|
|
query->addControlConceptData( this,queries_.size()-1 );
|
|
return query;
|
|
}
|
|
|
|
|
|
|
|
void Search::startSearch()
|
|
{
|
|
sal_Int32 i,j;
|
|
// set up ConceptData lists
|
|
// order search terms
|
|
quicksort( 0, free2_ - 1);
|
|
|
|
// remove duplicates
|
|
for (i = 0; i < free2_ - 1; i = j)
|
|
{
|
|
for (j = i + 1; j < free2_; j++)
|
|
{
|
|
if( conceptData_[i]->crqEquals( conceptData_[j].get() ) )
|
|
conceptData_[j] = 0;
|
|
else
|
|
i = j;
|
|
}
|
|
}
|
|
|
|
// create lists
|
|
for( i = 0; i < free2_ - 1; i = j )
|
|
{
|
|
for (j = i + 1; j < free2_; j++ )
|
|
{
|
|
if( conceptData_[j].is() )
|
|
if( conceptData_[i]->cEquals( conceptData_[j].get() ) )
|
|
{
|
|
conceptData_[i]->addLast( conceptData_[j].get() );
|
|
conceptData_[j] = 0;
|
|
}
|
|
else
|
|
i = j;
|
|
}
|
|
}
|
|
|
|
// densify
|
|
for( i = 0; i < free2_ - 1; i++)
|
|
{
|
|
if( ! conceptData_[i].is() )
|
|
{
|
|
for( j = i + 1; j < free2_; j++)
|
|
{
|
|
if (conceptData_[j].is() )
|
|
{
|
|
conceptData_[i] = conceptData_[j];
|
|
conceptData_[j] = 0;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// set up new document generators
|
|
nextDocGenHeap_.reset();
|
|
for( i = 0; i < free2_ && conceptData_[i].is(); i++)
|
|
{
|
|
NextDocGenerator* gen = new NextDocGenerator( conceptData_[i].get(),env_ );
|
|
try
|
|
{
|
|
sal_Int32 doc;
|
|
gen->first();
|
|
if( ( doc = gen->getDocument() ) != NonnegativeIntegerGenerator::END )
|
|
{
|
|
/* !!! ??? is concept length used any more in any way
|
|
conceptData_[i].
|
|
setConceptLength(_env.
|
|
getConceptLength(conceptData_[i].getConcept()));
|
|
*/
|
|
nextDocGenHeap_.addGenerator( gen );
|
|
}
|
|
}
|
|
catch( ... )
|
|
{
|
|
}
|
|
}
|
|
|
|
nextDocGenHeap_.start();
|
|
env_->reset();
|
|
env_->resetContextSearch();
|
|
searchDocument();
|
|
}
|
|
|
|
|
|
|
|
|
|
void Search::addTerm( sal_Int32 col,sal_Int32 concept,double score )
|
|
{
|
|
if( env_->occursInText( concept ) )
|
|
{
|
|
ConceptData* cd = queries_[queries_.size()-1]->makeConceptData( col,concept,score,queries_.size()-1 );
|
|
if( sal_uInt32( free2_ ) == conceptData_.size() )
|
|
{
|
|
conceptData_.push_back( 0 );
|
|
// conceptVisitor_ = &conceptData_[0];
|
|
}
|
|
conceptData_[ free2_++ ] = cd;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void Search::searchDocument()
|
|
{
|
|
std::vector< RoleFiller* > start( queries_.size() );
|
|
do
|
|
{
|
|
try
|
|
{
|
|
switch( nextDocument( start ) )
|
|
{
|
|
case 0: // multi group
|
|
genHeap_.start( start );
|
|
while( genHeap_.next( start ) )
|
|
;
|
|
break;
|
|
|
|
case 1: // single group
|
|
while( firstGenerator_.next() )
|
|
firstGenerator_.generateFillers( start );
|
|
break;
|
|
|
|
case 2: // reached the end
|
|
return;
|
|
}
|
|
}
|
|
catch( const excep::XmlSearchException& )
|
|
{
|
|
continue;
|
|
}
|
|
|
|
RoleFiller* next;
|
|
for( sal_uInt32 i = 0; i < queries_.size(); ++i )
|
|
{
|
|
if( ( next = start[i] ) != 0 && next != RoleFiller::STOP() )
|
|
next->scoreList( queries_[i],document_ );
|
|
else if( queries_[i]->zoned() )
|
|
{
|
|
RoleFiller* rfs = queries_[i]->getRoleFillers();
|
|
if( rfs && rfs != RoleFiller::STOP() )
|
|
rfs->scoreList( queries_[i],document_ );
|
|
}
|
|
}
|
|
genHeap_.reset();
|
|
}
|
|
while( nextDocGenHeap_.isNonEmpty() );
|
|
|
|
// #i80952
|
|
#if 0
|
|
for( sal_uInt32 i = 0; i < start.size(); ++i )
|
|
if( start[i] != RoleFiller::STOP() )
|
|
delete start[i];
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
sal_Int32 Search::nextDocument( std::vector< RoleFiller* >& start ) throw( xmlsearch::excep::XmlSearchException )
|
|
{
|
|
while( nextDocGenHeap_.isNonEmpty() )
|
|
{ // still something to do
|
|
sal_uInt32 i;
|
|
for( i = 0; i < queries_.size(); ++i )
|
|
if( queries_[i] )
|
|
queries_[i]->resetForNextDocument();
|
|
|
|
// gather all concepts this document has
|
|
// and store associated conceptData
|
|
sal_Int32 index = 0;
|
|
document_ = nextDocGenHeap_.getDocument();
|
|
docConcepts_.clear();
|
|
queryMasks_.clear();
|
|
do
|
|
{
|
|
docConcepts_.push_back( nextDocGenHeap_.getConcept() );
|
|
queryMasks_.push_back( nextDocGenHeap_.getQueryMask() );
|
|
ConceptData *conceptData = ( conceptData_[ index++ ] = nextDocGenHeap_.getTerms() ).get();
|
|
conceptData->runBy( queries_ );
|
|
nextDocGenHeap_.step();
|
|
}
|
|
while( nextDocGenHeap_.atDocument( document_) );
|
|
|
|
// if there is no saturation model, some query will always vote YES
|
|
// and so every document will be opened
|
|
// even if this case, however, savings can be achieved by not generating fillers
|
|
// for some queries (not scoring, etc)
|
|
// and, with more care, creation of some GroupGenerators can be avoided
|
|
// saturating queries with lots of good hits will lead to best results
|
|
|
|
sal_Int32 voteMask = 0;
|
|
Query* query;
|
|
for( i = 0; i < queries_.size(); ++i )
|
|
{
|
|
query = queries_[i];
|
|
if( query )
|
|
{
|
|
query->saveRoleFillers( 0 );
|
|
if( query->vote() )
|
|
{
|
|
// normal reset
|
|
start[i] = query->zoned() ? RoleFiller::STOP() : 0;
|
|
voteMask |= 1 << i;
|
|
}
|
|
else
|
|
start[i] = RoleFiller::STOP(); // prohibit setting
|
|
}
|
|
}
|
|
|
|
// we may eliminate some ConceptGroupGenerators
|
|
// those which would be used only by Queries which voted NO
|
|
if( voteMask != 0 )
|
|
{ // need to open up document
|
|
ConceptGroupGenerator* gen;
|
|
// !!! don't gather Fillers for disinterested Queries
|
|
if( openDocumentIndex( document_ ) )
|
|
{ // multi group
|
|
// set up all needed generators
|
|
sal_Int32 j = 0;
|
|
while( ( queryMasks_[j] & voteMask ) == 0 )
|
|
++j;
|
|
// assert(j < index);
|
|
sal_Int32 c = docConcepts_[j];
|
|
sal_Int32 group = 0;
|
|
// find first group
|
|
while( /*group < maxConcepts_.size() &&*/
|
|
c > maxConcepts_[ group ] && ++group < limit_ )
|
|
;
|
|
gen = makeGenerator( group );
|
|
gen->addTerms( indexOf(c),conceptData_[j].get() );
|
|
|
|
for( ++j; j < index; j++ )
|
|
if( ( queryMasks_[j] & voteMask ) > 0 )
|
|
{
|
|
c = docConcepts_[j];
|
|
if( c > max_ )
|
|
{ // need to find another group
|
|
// assert(group < _limit);
|
|
while( /*group < maxConcepts_.size() &&*/
|
|
c > maxConcepts_[ group ] && ++group < limit_ )
|
|
;
|
|
gen = makeGenerator( group );
|
|
}
|
|
gen->addTerms( indexOf(c),conceptData_[j].get() );
|
|
}
|
|
return 0;
|
|
}
|
|
else
|
|
{ // single group
|
|
for( sal_Int32 j = 0; j < index; j++ )
|
|
if( queryMasks_[j] & voteMask )
|
|
firstGenerator_.addTerms( indexOf( docConcepts_[j] ),conceptData_[j].get() );
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
return 2;
|
|
}
|
|
|
|
|
|
|
|
|
|
bool Search::openDocumentIndex( sal_Int32 docNo ) throw( excep::XmlSearchException )
|
|
{
|
|
data_ = env_->getPositions( dataL_,docNo );
|
|
base_ = env_->getDocumentIndex( docNo );
|
|
|
|
startingIndex_ = 0;
|
|
sal_Int32 kk = data_[ base_ ] & 0xFF, k2;
|
|
|
|
switch( kk >> 6 )
|
|
{ // get type
|
|
case 0: // single group, no extents
|
|
k2 = data_[base_ + 1];
|
|
firstGenerator_.init( dataL_,data_,base_ += 2,k2 );
|
|
// decode concept table
|
|
nConcepts_ = firstGenerator_.decodeConcepts( kk & 0x3F,0,concepts_ );
|
|
return false;
|
|
|
|
case 2: // multi group, no extents
|
|
{
|
|
kTable_.clear();
|
|
offsets_.clear();
|
|
maxConcepts_.clear();
|
|
util::ByteArrayDecompressor compr( dataL_,data_,base_ + 1 );
|
|
compr.decode( kk & 0x3F,kTable_ );
|
|
|
|
sal_Int32 last = kTable_.back();
|
|
kTable_.pop_back();
|
|
compr.ascDecode( last,offsets_ );
|
|
last = kTable_.back();
|
|
kTable_.pop_back();
|
|
compr.ascDecode( last,maxConcepts_ );
|
|
|
|
base_ += 1 + compr.bytesRead();
|
|
limit_ = maxConcepts_.size();
|
|
}
|
|
return true;
|
|
|
|
case 1: // single group, extents
|
|
case 3: // multi group, extents
|
|
throw excep::XmlSearchException( rtl::OUString::createFromAscii( "extents not yet implemented\n" ) );
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
ConceptGroupGenerator* Search::makeGenerator( sal_Int32 group )
|
|
throw( excep::XmlSearchException )
|
|
{
|
|
sal_Int32 shift,index;
|
|
if( group > 0 )
|
|
{
|
|
index = base_ + offsets_[ group - 1 ];
|
|
shift = maxConcepts_[ group - 1 ];
|
|
}
|
|
else
|
|
{
|
|
index = base_;
|
|
shift = 0;
|
|
}
|
|
|
|
// initialize generator
|
|
ConceptGroupGenerator* gen =
|
|
new ConceptGroupGenerator( dataL_,data_,index,kTable_[ 1 + 2*group ] );
|
|
// decode concept table
|
|
nConcepts_ = gen->decodeConcepts( kTable_[2*group],shift,concepts_ );
|
|
|
|
if( group < limit_ )
|
|
max_ = concepts_[ nConcepts_ ] = maxConcepts_[ group ];
|
|
else
|
|
max_ = concepts_[ nConcepts_ - 1 ];
|
|
|
|
genHeap_.addGenerator( gen );
|
|
startingIndex_ = 0; // in _concepts; lower search index
|
|
return gen;
|
|
}
|
|
|
|
|
|
|
|
sal_Int32 Search::indexOf(sal_Int32 concept) throw( excep::XmlSearchException )
|
|
{
|
|
sal_Int32 i = startingIndex_,j = nConcepts_,k;
|
|
while( i <= j )
|
|
if( concepts_[ k = (i + j)/2 ] < concept )
|
|
i = k + 1;
|
|
else if( concept < concepts_[k] )
|
|
j = k - 1;
|
|
else
|
|
{
|
|
startingIndex_ = k + 1;
|
|
return k;
|
|
}
|
|
throw excep::XmlSearchException( rtl::OUString::createFromAscii( "indexOf not found" ) );
|
|
}
|
|
|
|
|
|
|
|
|
|
sal_Int32 Search::partition( sal_Int32 p,sal_Int32 r )
|
|
{
|
|
rtl::Reference< ConceptData > x = conceptData_[ ((p + r) >> 1) & 0x7FFFFFFF ];
|
|
sal_Int32 i = p - 1, j = r + 1;
|
|
while( true )
|
|
{
|
|
while( x->compareWith( conceptData_[--j].get() ) )
|
|
;
|
|
while( conceptData_[++i]->compareWith( x.get() ) )
|
|
;
|
|
if( i < j )
|
|
{
|
|
rtl::Reference< ConceptData > t = conceptData_[i];
|
|
conceptData_[i] = conceptData_[j];
|
|
conceptData_[j] = t;
|
|
}
|
|
else
|
|
return j;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void Search::quicksort( sal_Int32 p,sal_Int32 r )
|
|
{
|
|
while (p < r)
|
|
{
|
|
sal_Int32 q = partition( p,r );
|
|
quicksort(p, q);
|
|
p = q + 1;
|
|
}
|
|
}
|