Files
libreoffice/xmlhelp/source/com/sun/star/help/HelpLinker.cxx
Oliver Bolte 4b66b10e37 INTEGRATION: CWS ab45_DEV300 (1.10.18); FILE MERGED
2008/02/20 09:47:35 ab 1.10.18.3: RESYNC: (1.10-1.11); FILE MERGED
2008/02/18 10:35:59 ab 1.10.18.2: #i83624# Removed warning
2008/02/18 09:18:43 ab 1.10.18.1: #i83624# Parse help tree file
2008-02-26 06:46:27 +00:00

5705 lines
174 KiB
C++

/*************************************************************************
*
* OpenOffice.org - a multi-platform office productivity suite
*
* $RCSfile: HelpLinker.cxx,v $
*
* $Revision: 1.12 $
*
* last change: $Author: obo $ $Date: 2008-02-26 07:46:27 $
*
* The Contents of this file are made available subject to
* the terms of GNU Lesser General Public License Version 2.1.
*
*
* GNU Lesser General Public License Version 2.1
* =============================================
* Copyright 2005 by Sun Microsystems, Inc.
* 901 San Antonio Road, Palo Alto, CA 94303, USA
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
************************************************************************/
#include "HelpCompiler.hxx"
#include <map>
#include <string.h>
#include <limits.h>
#include <boost/shared_ptr.hpp>
#include <boost/tokenizer.hpp>
#include <libxslt/xslt.h>
#include <libxslt/transform.h>
#include <libxslt/xsltutils.h>
#include <libxslt/functions.h>
#include <libxslt/extensions.h>
#include <unicode/brkiter.h>
#include <unicode/ustring.h>
#include <unicode/ucnv.h>
#include <sal/types.h>
#include <osl/time.h>
#ifdef SYSTEM_EXPAT
#include <expat.h>
#else
#ifndef XmlParse_INCLUDED
#include <expat/xmlparse.h>
#endif
#endif
/**
 * Describes one jar archive to be written: it remembers the archive's file
 * name and offers operations to register files and directory trees.
 * Only setname()/getname() are defined inline; the other members are not
 * defined in this part of the file.
 */
class JarOutputStream
{
private:
// path of the archive, as handed to setname()
fs::path filename;
// NOTE(review): presumably collects the command line for an external helper - confirm against commit()
std::ostringstream perlline;
public:
JarOutputStream();
// stores the archive path
void setname(const fs::path &name) { filename = name; }
// returns the archive path last stored with setname()
const fs::path& getname() const { return filename; }
void addFile(const std::string &name, const std::string &key);
void addTree(const std::string &dir, const std::string &key);
void dontCompress(const std::string &key);
void commit();
};
/**
 * Ordered list of ids collected for one keyword.
 */
struct Data
{
    std::vector<std::string> _idList;
    typedef std::vector<std::string>::const_iterator cIter;

    /** Adds @p id at the end of the list. */
    void append(const std::string &id)
    {
        _idList.insert(_idList.end(), id);
    }

    /** Returns all ids in insertion order, each one followed by ';'. */
    std::string getString() const
    {
        std::string joined;
        for (std::vector<std::string>::size_type pos = 0; pos < _idList.size(); ++pos)
        {
            joined.append(_idList[pos]);
            joined.append(";");
        }
        return joined;
    }
};
class HelpKeyword
{
private:
typedef std::hash_map<std::string, Data, pref_hash> DataHashtable;
DataHashtable _hash;
public:
void insert(const std::string &key, const std::string &id)
{
Data &data = _hash[key];
data.append(id);
}
void dump(DB* table)
{
DataHashtable::const_iterator aEnd = _hash.end();
for (DataHashtable::const_iterator aIter = _hash.begin(); aIter != aEnd; ++aIter)
{
const std::string &keystr = aIter->first;
DBT key;
memset(&key, 0, sizeof(key));
key.data = const_cast<char*>(keystr.c_str());
key.size = keystr.length();
const Data &data = aIter->second;
std::string str = data.getString();
DBT value;
memset(&value, 0, sizeof(value));
value.data = const_cast<char*>(str.c_str());
value.size = str.length();
table->put(table, NULL, &key, &value, 0);
}
}
};
namespace PrefixTranslator
{
    /**
     * Replaces a leading "vnd.sun.star.help://" by "#HLP#".
     * Any other input is returned unchanged.
     */
    std::string translatePrefix(const std::string &input)
    {
        const char *scheme = "vnd.sun.star.help://";
        if (input.find(scheme) != 0)
            return input;

        std::string translated("#HLP#");
        translated += input.substr(strlen(scheme));
        return translated;
    }
}
class IndexAccessor
{
fs::path _dirName;
public:
IndexAccessor(const fs::path &dirName) : _dirName(dirName) {}
IndexAccessor(const IndexAccessor &another) { _dirName = another._dirName; }
fs::path indexFile(const std::string &name) const { return _dirName / name; }
std::ifstream* getLineInput(const std::string &name);
std::fstream* getOutputStream(const std::string &name);
std::vector<unsigned char> readByteArray(const std::string &fileName);
void clear();
std::fstream *getRAF(const std::string &name, bool update) throw( HelpProcessingException );
void createIfNeeded() {}
};
std::ifstream* IndexAccessor::getLineInput(const std::string &name)
{
return new std::ifstream(indexFile(name).native_file_string().c_str());
}
std::fstream* IndexAccessor::getOutputStream(const std::string &name)
{
return new std::fstream(indexFile(name).native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary);
}
/**
 * Reads the complete index file @p fileName into memory.
 *
 * The file is read in 16 KiB chunks with read()/gcount(). The former
 * readsome() call only returns what happens to be buffered already, which
 * may legitimately be nothing, so the result could come back empty or
 * truncated depending on the library implementation.
 *
 * @return the file content; empty when the file cannot be opened or is empty.
 */
std::vector<unsigned char> IndexAccessor::readByteArray(const std::string &fileName)
{
    const std::streamsize chunk = 1024*16;
    std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary);
    std::vector<unsigned char> ret;
    std::vector<unsigned char>::size_type used = 0;
    while (in.good())
    {
        // make room for one more chunk behind the bytes read so far
        ret.resize(used + chunk);
        in.read(reinterpret_cast<char*>(&ret[used]), chunk);
        const std::streamsize got = in.gcount();
        if (got <= 0)
            break;
        used += got;
    }
    // drop the unused tail of the last chunk
    ret.resize(used);
    return ret;
}
std::fstream* IndexAccessor::getRAF(const std::string &name, bool update)
throw( HelpProcessingException )
{
std::fstream *_file = new std::fstream;
fs::path fullname = indexFile(name);
if (!update)
{
_file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::binary);
}
else
{
_file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary);
if (!_file->is_open())
{
HCDBG(std::cerr << "didn't exist" << std::endl);
_file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary);
}
if (!_file->is_open())
{
std::stringstream aStrStream;
aStrStream << "Cannot open " << name << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
}
return _file;
}
/**
 * Currently does nothing.
 * The whole body is disabled with "#if 0"; the disabled text is written in
 * Java syntax and would delete every file of the index directory.
 */
void IndexAccessor::clear()
{
#if 0
File thisDir = indexFile(".");
File[] components = thisDir.listFiles();
if (components != null)
for (int i = 0; i < components.length; i++)
components[i].delete();
#endif
}
// the lines of a text file, one string per line
typedef std::vector< std::string > VectorLines;
/**
 * In-memory copy of the index directory's schema file. Each line starts with
 * a part name followed by that part's parameters.
 * The file is read in the constructor and written back by save().
 */
class Schema : public IndexAccessor
{
private:
// name of the schema file inside the index directory
static std::string PartName;
// save() only writes when this is true
bool _update;
// content of the schema file, one entry per line
VectorLines _lines;
public:
Schema(const IndexAccessor &index, bool update);
// opens the schema file for reading; the caller deletes the returned stream
std::ifstream* getSchemaLineInput() { return getLineInput(PartName); }
void read();
Stringtable parameters(const std::string &name) const;
void update(const std::string &partName, const std::string &parameters);
void save();
};
std::string Schema::PartName = "SCHEMA";
/**
 * Predicate: true for every string that begins with the prefix given to the
 * constructor. An empty prefix matches every string.
 *
 * The prefix is stored by value, so the predicate stays valid when it was
 * built from a temporary or outlives the string it was built from.
 */
class startsWith
{
public:
    startsWith(const std::string &in) : str(in) {}
    bool operator() ( const std::string &in ) const
    {
        // compares only the first str.size() characters instead of searching all of 'in'
        return in.compare(0, str.size(), str) == 0;
    }
private:
    const std::string str;
};
void Schema::update(const std::string &partName, const std::string &inparameters)
{
VectorLines::iterator aEnd = std::remove_if(_lines.begin(), _lines.end(), startsWith(partName));
if (aEnd != _lines.end()) _lines.erase(aEnd, _lines.end());
_lines.push_back(partName + " " + inparameters);
}
/**
 * Looks up the parameters of the part @p name.
 *
 * The first stored line that begins with @p name is split at blanks and '='.
 * The first token (the part name) is skipped, the remaining tokens are taken
 * pairwise as key and value; a trailing key without value is ignored.
 *
 * @return the key/value pairs of that line; empty when no line matches
 */
Stringtable Schema::parameters(const std::string &name) const
{
    typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;

    Stringtable result;
    VectorLines::const_iterator aEnd = _lines.end();
    for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter)
    {
        if (aIter->find(name) != 0)
            continue;

        boost::char_separator<char> sep(" =");
        // tokenize the stored line; tokenizing 'name' itself can never yield a parameter
        Tokenizer tokens(*aIter, sep);
        Tokenizer::const_iterator it = tokens.begin();
        if (it != tokens.end())
            ++it; // skip name
        while (it != tokens.end())
        {
            // copy the tokens: the iterator hands out a reference to its own
            // current token, which changes as soon as the iterator advances
            const std::string part1 = *it;
            ++it;
            if (it == tokens.end())
                break;
            const std::string part2 = *it;
            result[part1] = part2;
            ++it;
        }
        break; // only the first matching line counts
    }
    return result;
}
/**
 * Creates the schema for the directory described by @p index and reads the
 * schema file at once.
 * @param inupdate  whether save() is allowed to write the file back
 */
Schema::Schema(const IndexAccessor &index, bool inupdate)
    : IndexAccessor(index)
    , _update(inupdate)
{
    read();
}
#ifdef UNX
#define MAX_LINE PATH_MAX
#else
#define MAX_LINE _MAX_PATH
#endif
void Schema::read()
{
std::ifstream* in = getSchemaLineInput();
char line[MAX_LINE];
// This needs to be replaced with our XML Parser
while (in->getline(line, MAX_LINE))
_lines.push_back(line);
delete in;
}
void Schema::save()
{
if (_update)
{
std::fstream* out = getOutputStream(PartName);
*out << "JavaSearch 1.0\n";
VectorLines::const_iterator aEnd = _lines.end();
for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter)
*out << *aIter << '\n';
delete out;
}
}
/**
 * The parameters one named part has in the schema.
 * The parameters are fetched from the schema once, in the constructor.
 */
class DBPartParameters
{
// schema the parameters come from and are written back to
Schema &_schema;
// name of the part, i.e. the first token of its schema line
std::string _partName;
// key/value pairs read from the schema line
Stringtable _parameters;
protected:
bool parametersKnown() const;
// replaces this part's line in the schema by "<partName> <parameters>"
void updateSchema(const std::string &parameters) { _schema.update(_partName, parameters); }
public:
DBPartParameters(Schema &schema, const std::string &partName);
int integerParameter(const std::string &name);
};
/**
 * Binds the object to @p schema and loads the parameters that the schema
 * currently holds for @p partName.
 */
DBPartParameters::DBPartParameters(Schema &schema, const std::string &partName)
    : _schema(schema)
    , _partName(partName)
    , _parameters(schema.parameters(partName))
{
}
/** True when the schema supplied at least one parameter for this part. */
bool DBPartParameters::parametersKnown() const
{
    const bool nothingLoaded = _parameters.empty();
    return !nothingLoaded;
}
/**
 * Returns the parameter @p name converted to an integer.
 *
 * @return the parsed value, or 0 when the parameter is missing or does not
 *         start with a number. The result used to be an uninitialised
 *         variable in that case.
 *
 * Looking up a missing name adds an empty entry to the parameter table,
 * as before.
 */
int DBPartParameters::integerParameter(const std::string &name)
{
    std::istringstream converter(_parameters[name]);
    int ret = 0;
    converter >> ret;
    return ret;
}
/**
 * Schema parameters of a part that is stored as a file of fixed-size blocks:
 * the file path, the block size ("bs") and the number of the root block ("rt").
 */
class BlockManagerParameters : public DBPartParameters
{
private:
// path of the block file inside the index directory
fs::path _file;
// size of one block in bytes
int _blockSize;
protected:
// number of the root block
int _root;
public:
BlockManagerParameters(Schema &schema, const std::string &partName);
bool readState();
const fs::path& getFile() const { return _file; }
int getBlockSize() const { return _blockSize; }
void setBlockSize(int size) { _blockSize = size; }
int getRootPosition() const { return _root; }
void setRoot(int root) { _root = root; }
// note: hides (does not override) DBPartParameters::updateSchema
void updateSchema(const std::string &params);
};
/**
 * Writes this part's line into the schema. The parameter text is
 * "bs=<blockSize> rt=<root> fl=-1 " followed by @p params.
 */
void BlockManagerParameters::updateSchema(const std::string &params)
{
    std::ostringstream line;
    line << "bs=" << _blockSize;
    line << " rt=" << _root;
    line << " fl=-1 " << params;
    DBPartParameters::updateSchema(line.str());
}
/**
 * Loads the block parameters of @p partName from @p schema.
 *
 * _blockSize and _root start as 0 and are overwritten by readState() when the
 * schema knows the part. _blockSize used to stay uninitialised for an unknown
 * part, so getBlockSize() returned an indeterminate value until
 * setBlockSize() was called.
 */
BlockManagerParameters::BlockManagerParameters(Schema &schema, const std::string &partName)
    : DBPartParameters(schema, partName), _blockSize(0), _root(0)
{
    _file = schema.indexFile(partName);
    HCDBG(std::cerr << "file name set to " << _file.native_file_string());
    readState();
}
/**
 * Takes block size ("bs") and root block number ("rt") from the loaded
 * parameters.
 * @return false, leaving both values untouched, when no parameters are known
 */
bool BlockManagerParameters::readState()
{
    if (!parametersKnown())
        return false;

    _blockSize = integerParameter("bs");
    _root = integerParameter("rt");
    return true;
}
/**
 * Block parameters of a btree dictionary; adds the first unused id,
 * which is written to the schema as "id1".
 */
class BtreeDictParameters : public BlockManagerParameters
{
private:
// first id that is not in use yet
int _id1;
public:
BtreeDictParameters(Schema &schema, const std::string &partName);
int getFreeID() const { return _id1; }
void setFreeID(int id) { _id1 = id; }
// writes "id1=<free id> id2=1" together with the block parameters into the schema
void updateSchema();
};
/**
 * Writes this dictionary's line into the schema: the block parameters
 * followed by "id1=<free id> id2=1".
 */
void BtreeDictParameters::updateSchema()
{
    std::ostringstream ids;
    ids << "id1=" << _id1;
    ids << " id2=1";
    BlockManagerParameters::updateSchema(ids.str());
}
/**
 * Loads the block parameters of @p partName from @p schema.
 *
 * The free id starts as 0 and is not read from the schema here. It used to
 * be left uninitialised, so getFreeID() returned an indeterminate value
 * until setFreeID() was called.
 */
BtreeDictParameters::BtreeDictParameters(Schema &schema, const std::string &partName)
    : BlockManagerParameters(schema, partName), _id1(0)
{
}
/**
 * Reads a 4 byte integer, most significant byte first.
 *
 * A byte that cannot be read counts as 0. The byte variable used to be
 * uninitialised, so a failed read put an indeterminate value into the
 * result. The stream state tells the caller whether the read succeeded.
 */
int readInt(std::fstream &in)
{
    HCDBG(std::cerr << "want to read at " << in.tellg() << std::endl);
    int ret = 0;
    for (int i = 3; i >= 0; --i)
    {
        unsigned char byte = 0;
        in.read( reinterpret_cast<char*>(&byte), 1 );
        ret |= (static_cast<unsigned int>(byte) << (i*8));
        HCDBG(fprintf(stderr, "inputting %x ret is now %x\n", byte, ret));
    }
    return ret;
}
/** Writes the single byte @p byte to @p out. */
void writeByte(std::fstream &out, unsigned char byte)
{
    const char *raw = reinterpret_cast<const char*>(&byte);
    out.write(raw, 1);
}
/**
 * Writes the low 16 bits of @p item as two bytes, most significant byte
 * first.
 */
void writeShort(std::fstream &out, int item)
{
    for (int shift = 8; shift >= 0; shift -= 8)
    {
        const unsigned char octet = static_cast<unsigned char>(item >> shift);
        out.write( reinterpret_cast<const char*>(&octet), 1 );
    }
}
/**
 * Writes @p item as four bytes, most significant byte first
 * (the counterpart of readInt).
 */
void writeInt(std::fstream &out, int item)
{
    HCDBG(std::cerr << "want to write at " << out.tellp() << std::endl);
    for (int shift = 24; shift >= 0; shift -= 8)
    {
        unsigned char byte = static_cast<unsigned char>(item >> shift);
        HCDBG(fprintf(stderr, "outputting %x in is %x\n", byte, item));
        out.write( reinterpret_cast<const char*>(&byte), 1 );
    }
}
/**
 * Fills @p _data with the next _data.size() bytes of @p in.
 * The vector is not resized. An empty vector is left alone, because taking
 * the address of element 0 of an empty vector is not allowed.
 * Whether all bytes could be read has to be taken from the stream state.
 */
void readFully(std::fstream &in, std::vector<unsigned char> &_data)
{
    if (_data.empty())
        return;
    in.read(reinterpret_cast<char*>(&_data[0]), _data.size());
}
/**
Base class for (typically btree) blocks to hold either
byte vectors representing graph/tree edges,
or pairs (key, id) for dictionaries.
Each block has a header and a data section.

In the file the header is two big-endian integers: the block number, and
a word whose top bit is the leaf flag and whose lower 31 bits are _free.
*/
class Block
{
public:
    static int HEADERLEN;
    // length of Block ID in bytes
    static int IDLEN;
    // number of the block
    // used for both referring to the block
    // and addressing the block in file
    unsigned int _number;
    bool _isLeaf;
    // first available byte in data section
    int _free;
    std::vector<unsigned char> _data;

    /** Creates an empty leaf whose data section has blocksize - HEADERLEN zero bytes. */
    Block(int blocksize)
        : _number(0), _isLeaf(true), _free(0), _data(blocksize - HEADERLEN)
    {
    }

    virtual ~Block() {}

    void setBlockNumber(int n) { _number = n; }
    virtual void setFree(int free) { _free = free; }

    // interpret 4 bytes at 'i' as an integer (most significant byte first)
    int integerAt(int i) const
    {
        int value = _data[i] & 0xFF;
        value = (value << 8) | (_data[i + 1] & 0xFF);
        value = (value << 8) | (_data[i + 2] & 0xFF);
        value = (value << 8) | (_data[i + 3] & 0xFF);
        return value;
    }

    // store 'value' in the 4 bytes at 'i', most significant byte first
    void setIntegerAt(int i, int value)
    {
        _data[i]     = (unsigned char)((value >> 24) & 0xFF);
        _data[i + 1] = (unsigned char)((value >> 16) & 0xFF);
        _data[i + 2] = (unsigned char)((value >> 8) & 0xFF);
        _data[i + 3] = (unsigned char)(value & 0xFF);
    }

    /** Reads header and data section from the current position of @p in. */
    void readIn(std::fstream &in)
    {
        _number = readInt(in);
        const int twoFields = readInt(in);
        _isLeaf = (twoFields & 0x80000000) != 0;
        HCDBG(std::cerr << "read leaf as " << _isLeaf << std::endl);
        _free = twoFields & 0x7FFFFFFF;
        readFully(in, _data);
    }

    /** Writes header and data section at the current position of @p out. */
    void writeOut(std::fstream &out) const
    {
        writeInt(out, _number);
        writeInt(out, _free | (_isLeaf ? 0x80000000 : 0));
        out.write(reinterpret_cast<const char*>(&_data[0]), _data.size());
    }
};
int Block::HEADERLEN = 8;
// length of Block ID in bytes
int Block::IDLEN = 4;
class BtreeDict;
class EntryProcessor;
typedef std::vector<int> IntegerArray;
/**
 * A block of a btree dictionary.
 *
 * The data section starts with the number of entries (4 bytes), followed by
 * the entries. An entry at offset i consists of the stored key length
 * (1 byte at i), the compression count (1 byte at i + 1), the id (4 bytes at
 * i + 2) and the stored key bytes. The compression count is the position in
 * the key buffer at which the stored bytes are written when a key is
 * restored, so bytes before it are kept from the previously restored entry.
 * Child block numbers are kept as integers addressed via nthPointer().
 */
class DictBlock : public Block
{
public:
DictBlock();
// offset of the first unused byte of the entry area within the data section
int free() const { return _free + firstEntry(); }
int numberOfEntries() const { return integerAt(0); }
int nthPointer(int n) const { return integerAt(4*(n + 1)); }
int getChildIdx(int index) const;
// number of key bytes stored in the entry at offset i
int entryKeyLength(int i) const { return _data[i] & 0xFF; }
// buffer position at which the stored key bytes of the entry at offset i start
int entryCompression(int i) const { return _data[i + 1] & 0xFF; }
int entryID(int i) const { return integerAt(i + 2); }
int entryLength(int entry) const;
int entryKey(int entry) const;
// offset of the first entry; the 4 bytes before it hold the number of entries
int firstEntry() const { return 4; }
int nextEntry(int entry) const { return entry + entryLength(entry); }
void restoreKeyInBuffer(int entry, std::vector<unsigned char> &buffer);
std::string restoreKey(int entry, std::vector<unsigned char> &buffer);
std::string findID(int id) throw( HelpProcessingException );
void setBlockNumbers(std::vector<int> &blocks) const;
void listBlock();
void doMap(BtreeDict &owner, const EntryProcessor &processor);
void withPrefix(BtreeDict &owner, const std::string &prefix,
size_t prefLen, IntegerArray &result);
};
class BlockFactory;
class BlockProcessor;
/**
 * One slot of the BlockManager's table: a block together with its
 * "modified" flag. The descriptor does not delete the block.
 */
class BlockDescriptor
{
public:
Block *_block;
// true when the block has to be written back on close
bool _modf;
BlockDescriptor(Block *block) : _block(block), _modf(false) {}
}; // end of BlockDescriptor
/**
 * Keeps all blocks of one block file in memory.
 * Block n is stored in the file at offset n * block size; the destructor
 * deletes the blocks and the block factory.
 */
class BlockManager
{
private:
static int INCR;
// the block file
std::fstream _file;
// size of one block in bytes, header included
long _blockSize;
// when false, close() writes nothing back
bool _update;
// creates the block objects; deleted in the destructor
BlockFactory *_blockFactory;
// all blocks, indexed by block number
std::vector<BlockDescriptor> _blockTab;
public:
BlockManager(const BlockManagerParameters *params,
bool update, BlockFactory *bfactory) throw( HelpProcessingException );
~BlockManager();
Block& accessBlock(int blockNumber);
void setModified(int blNum);
void close();
Block& getNewBlock();
void processBlocks(BlockProcessor &processor);
void mapBlock(Block* block);
void addDescriptor(Block* block) throw( HelpProcessingException );
private:
void writeBlock(const Block &bl);
};
int BlockManager::INCR = 64; // size increment
/**
 * Callback interface: processEntry() is called with key and id of each
 * dictionary entry visited by DictBlock::doMap().
 */
class EntryProcessor
{
public:
virtual void processEntry(const std::string &string, int id) const = 0;
virtual ~EntryProcessor() {};
};
class BtreeDict
{
public:
static int ENTHEADERLEN;
static int BLOCKSIZE;
static int DATALEN;
static int MaxKeyLength;
static int lastPtrIndex;
protected:
BlockManager *blockManager;
int root;
std::vector<int> blocks;
BtreeDict() {/*empty*/}
~BtreeDict() { delete blockManager; }
BtreeDict(const BtreeDictParameters *params);
void init(const BtreeDictParameters *params, bool update,
BlockFactory *bfactory);
public:
int fetch(const std::string &key);
void close();
private:
std::string fetch(int conceptID);
IntegerArray withPrefix(const std::string &prefix);
public:
DictBlock& accessBlock(int index);
DictBlock& child(const DictBlock &bl, int index) throw( HelpProcessingException );
private:
std::string findID(int blNum, int id);
int find(const DictBlock &bl, std::vector<unsigned char> &key, int index);
int find(const DictBlock &bl, std::vector<unsigned char> &key);
void setBlocks(std::vector<int> &blocks);
void map(const EntryProcessor &processor);
public:
void dumpnode(DictBlock &bl, int level);
};
/**
 * Creates the block objects a BlockManager works with.
 * makeBlock() returns a new block; the visible callers either hand it to the
 * BlockManager's table or delete it themselves.
 */
class BlockFactory
{
public:
virtual Block* makeBlock() const = 0;
virtual ~BlockFactory() {}
};
// number of DictBlocks created by DictBlockFactory so far
static int dictcount;
/** Factory for DictBlock objects. */
class DictBlockFactory : public BlockFactory
{
public:
    /** Returns a new, empty DictBlock and counts it in dictcount. */
    Block* makeBlock() const
    {
        ++dictcount;
        Block *fresh = new DictBlock;
        return fresh;
    }
};
/**
 * Opens the dictionary described by @p params read-only and builds the table
 * that maps each id to the block holding it.
 */
BtreeDict::BtreeDict(const BtreeDictParameters *params)
{
    BlockFactory *factory = new DictBlockFactory();
    init(params, false, factory);

    // one slot per id below the first free id
    const int idCount = params->getFreeID();
    blocks.resize(idCount);
    setBlocks(blocks);
}
/**
 * Debug aid: prints, for the inner block @p bl and recursively for its
 * children, the number of entries and the child block numbers to stderr.
 * Leaf blocks print nothing. @p level is the indentation depth in tabs.
 */
void BtreeDict::dumpnode(DictBlock &bl, int level)
{
    if (bl._isLeaf)
        return;

    fprintf(stderr, "\n");
    for (int tab = 0; tab < level; ++tab)
        fprintf(stderr, "\t");
    fprintf(stderr, "there are %d entries\n", bl.numberOfEntries());
    for (int tab = 0; tab < level; ++tab)
        fprintf(stderr, "\t");

    for (int entry = 0; entry < bl.numberOfEntries(); ++entry)
    {
        const int childNumber = bl.getChildIdx(entry);
        fprintf(stderr, " %d ", childNumber);
        dumpnode(accessBlock(childNumber), level + 1);
    }
    fprintf(stderr, "\n");
}
int BtreeDict::fetch(const std::string &key)
{
HCDBG(std::cerr << "fetching " << key << " from root " << root << std::endl);
DictBlock &bl = accessBlock(root);
int length = key.size();
std::vector<unsigned char> Key(length + 1);
memcpy(&(Key[0]), key.c_str(), length);
Key[length] = 0; // sentinel
return find(bl, Key);
}
/**
 * Returns the key stored for @p conceptID. The block to search is taken
 * from the id-to-block table; the index is not range-checked.
 */
std::string BtreeDict::fetch(int conceptID)
{
    const int blockNumber = blocks[conceptID];
    return findID(blockNumber, conceptID);
}
/** Collects the ids that DictBlock::withPrefix() finds for @p prefix, starting at the root block. */
IntegerArray BtreeDict::withPrefix(const std::string &prefix)
{
    IntegerArray ids;
    DictBlock &rootBlock = accessBlock(root);
    rootBlock.withPrefix(*this, prefix, prefix.size(), ids);
    return ids;
}
void BtreeDict::close()
{
blockManager->close();
}
void BtreeDict::init(const BtreeDictParameters *params, bool update,
BlockFactory *bfactory)
{
blockManager = new BlockManager(params, update, bfactory);
root = params->getRootPosition();
}
/**
 * Returns block number @p index as DictBlock.
 * The block is cast down without a check, i.e. it is taken for granted that
 * the block manager only holds DictBlocks.
 */
DictBlock& BtreeDict::accessBlock(int index)
{
    Block &plain = blockManager->accessBlock(index);
    return static_cast<DictBlock&>(plain);
}
/**
 * Returns child number @p index of the inner block @p bl.
 * @throws HelpProcessingException when @p bl is a leaf
 */
DictBlock& BtreeDict::child(const DictBlock &bl, int index) throw( HelpProcessingException )
{
    if (!bl._isLeaf)
        return accessBlock(bl.getChildIdx(index));

    std::stringstream aStrStream;
    aStrStream << "leaf's can't have children, screwed!" << std::endl;
    throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
}
std::string BtreeDict::findID(int blNum, int id)
{
return accessBlock(blNum).findID(id);
}
/**
 * Continues the search for @p key in child @p index of @p bl.
 * @return 0 when @p bl is a leaf (nothing left to search), otherwise the
 *         result of searching the child
 */
int BtreeDict::find(const DictBlock &bl, std::vector<unsigned char> &key, int index)
{
    HCDBG(std::cerr << "find2: " << bl._isLeaf << " : " << index << " : " << std::endl);
    if (bl._isLeaf)
        return 0;
    return find(child(bl, index), key);
}
/**
 * Searches block bl (and, via the three-argument find(), its children) for key.
 *
 * key holds the key bytes followed by one 0 byte; the 0 byte is not part of
 * the key. The entries of the block are walked front to back. nCharsEqual
 * counts how many leading bytes of key have matched so far; an entry is only
 * compared byte by byte when its compression count equals nCharsEqual.
 *
 * Returns the id of the entry whose key equals the searched key, or the result
 * of descending into the child selected by the position where the scan stopped.
 */
int BtreeDict::find(const DictBlock &bl, std::vector<unsigned char> &key)
{
// length of the key without the trailing sentinel
int inputKeyLen = key.size() - 1;
int entryPtr = bl.firstEntry();
int freeSpace = bl.free();
int nCharsEqual = 0;
int compression = 0;
HCDBG(std::cerr << "find1: " << inputKeyLen << " : "
<< entryPtr << " : " << freeSpace << " : " << nCharsEqual << " "
<< compression << std::endl);
for (int entryIdx = 0;;)
{
// ran off the end of the entries: continue in the last child
if (entryPtr == freeSpace)
return find(bl, key, bl.numberOfEntries());
else if (compression == nCharsEqual)
{
int keyLen = bl.entryKeyLength(entryPtr);
int keyPtr = bl.entryKey(entryPtr), i;
// extend the match over the bytes stored in this entry
for (i = 0; i < keyLen && key[nCharsEqual] == bl._data[keyPtr + i]; i++)
++nCharsEqual;
if (i == keyLen)
{
// all stored bytes matched; it is a hit when the key is used up as well
if (nCharsEqual == inputKeyLen)
return bl.entryID(entryPtr);
}
// first differing byte of the key is smaller: continue in child entryIdx
else if ((key[nCharsEqual]&0xFF) < (bl._data[keyPtr + i]&0xFF))
return find(bl, key, entryIdx);
}
else if (compression < nCharsEqual) // compression dropped
return find(bl, key, entryPtr == freeSpace
? bl.numberOfEntries() : entryIdx);
// skip ahead to the next entry whose compression count is not above nCharsEqual
// NOTE(review): this reads the compression byte at freeSpace when the last
// entry is passed; presumably it relies on the 0 bytes FullDictBlock::setFree writes there - confirm
do
{
entryPtr = bl.nextEntry(entryPtr);
++entryIdx;
}
while (bl.entryCompression(entryPtr) > nCharsEqual);
compression = bl.entryCompression(entryPtr);
}
}
/**
 * Callback interface for BlockManager::processBlocks(): process() is called
 * once per block. The processor keeps a reference to an integer table that
 * derived classes may fill.
 */
class BlockProcessor
{
protected:
// table owned by the caller; must outlive the processor
std::vector<int> &blocks;
public:
BlockProcessor(std::vector<int> &_blocks) : blocks(_blocks) {}
virtual void process(const Block &block) = 0;
virtual ~BlockProcessor() {}
};
class DictBlockProcessor : public BlockProcessor
{
public:
DictBlockProcessor(std::vector<int> &_blocks) : BlockProcessor(_blocks) {}
void process(const Block &block)
{
((const DictBlock&)block).setBlockNumbers(blocks);
}
};
/**
 * Opens the block file named by params and loads all of its blocks.
 *
 * Without update the file is opened read-only and the open is not checked.
 * With update it is opened read/write and created when missing; a new or
 * empty file gets an initial block 0 written to it.
 *
 * Throws HelpProcessingException when the file cannot be opened in update
 * mode, or (from addDescriptor) when a block's stored number does not match
 * its position in the file.
 *
 * NOTE(review): when this constructor throws, the destructor does not run, so
 * bfactory and the blocks mapped so far are not deleted - confirm whether
 * callers care.
 */
BlockManager::BlockManager(const BlockManagerParameters *params,
bool update, BlockFactory *bfactory) throw( HelpProcessingException )
: _blockFactory(bfactory)
{
_update = update;
// params.readState();
_blockSize = params->getBlockSize();
HCDBG(std::cerr << "opening " << params->getFile().native_file_string() << std::endl);
if (!update)
{
_file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::binary);
}
else
{
_file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary);
if (!_file.is_open())
{
HCDBG(std::cerr << "didn't exist" << std::endl);
// second attempt with trunc, which creates the file
_file.open(params->getFile().native_file_string().c_str(),
std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary);
}
if (!_file.is_open())
{
std::stringstream aStrStream;
aStrStream << "Cannot open " << params->getFile().native_file_string() << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
}
// determine the file length by seeking to the end
_file.seekg(0, std::ios::end);
long length = _file.tellg();
// tellg() yields -1 on failure, treat that as an empty file
if (length < 0) length = 0;
_file.seekg(0, std::ios::beg);
// reset any error flags the seeks may have set
_file.clear();
HCDBG(std::cerr << "len is " << length << std::endl);
if (length <= 0 && update)
{
// empty file: write an initial block 0 so that there is something to map
Block* _dummy = bfactory->makeBlock();
_dummy->setBlockNumber(0);
writeBlock(*_dummy);
delete _dummy;
length = _blockSize;
}
_file.seekg(0, std::ios::beg);
// NOTE(review): divides by _blockSize as delivered by params; a block size of 0 is not guarded against
int _blockTableSize = (length/_blockSize);
HCDBG(std::cerr << "len is now " << _blockTableSize << std::endl);
// read the blocks one after the other, in file order
for (int i = 0; i < _blockTableSize; ++i)
mapBlock(bfactory->makeBlock());
}
/**
 * Appends a new block: the block gets the next free block number, is
 * written to the file at once and is added to the block table.
 * @return the new block, which is owned by the block table
 */
Block& BlockManager::getNewBlock()
{
    const unsigned int nextNumber = _blockTab.size();
    Block *fresh = _blockFactory->makeBlock();
    fresh->setBlockNumber(nextNumber);
    writeBlock(*fresh);
    addDescriptor(fresh);
    return *(_blockTab[nextNumber]._block);
}
void BlockManager::setModified(int blNum)
{
_blockTab[blNum]._modf = true;
}
void BlockManager::close()
{
if (_update)
{
std::vector<BlockDescriptor>::const_iterator aEnd = _blockTab.end();
for (std::vector<BlockDescriptor>::const_iterator aIter = _blockTab.begin();
aIter != aEnd; ++aIter)
{
if (aIter->_modf)
writeBlock(*(aIter->_block));
}
}
_file.close();
}
/** Hands every block, in block number order, to @p processor. */
void BlockManager::processBlocks(BlockProcessor &processor)
{
    for (std::vector<BlockDescriptor>::size_type n = 0; n < _blockTab.size(); ++n)
    {
        const Block &current = *_blockTab[n]._block;
        processor.process(current);
    }
}
/**
 * Fills @p block from the current read position of the file and appends it
 * to the block table.
 */
void BlockManager::mapBlock(Block* block)
{
    Block &target = *block;
    target.readIn(_file);
    addDescriptor(block);
}
/**
 * Appends @p block to the block table.
 * @throws HelpProcessingException when the block's number differs from the
 *         table index it ends up at; the block stays in the table in that case
 */
void BlockManager::addDescriptor(Block *block) throw( HelpProcessingException )
{
    _blockTab.push_back(BlockDescriptor(block));
    HCDBG(std::cerr << "numbers are " << block->_number << " " << (_blockTab.size()-1) << std::endl);

    const bool inPlace = (block->_number == _blockTab.size() - 1);
    if (!inPlace)
    {
        std::stringstream aStrStream;
        aStrStream << "totally screwed" << std::endl;
        throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
    }
    HCDBG(std::cerr << "addDescriptor blocks are now " << _blockTab.size() << std::endl);
}
void BlockManager::writeBlock(const Block &bl)
{
_file.seekp(_blockSize * bl._number);
bl.writeOut(_file);
}
/** Returns block @p blockNumber; the number is not range-checked. */
Block& BlockManager::accessBlock(int blockNumber)
{
    const BlockDescriptor &slot = _blockTab[blockNumber];
    return *slot._block;
}
/** Deletes all blocks of the table and the block factory. */
BlockManager::~BlockManager()
{
    for (std::vector<BlockDescriptor>::size_type n = 0; n < _blockTab.size(); ++n)
        delete _blockTab[n]._block;
    delete _blockFactory;
}
void BtreeDict::setBlocks(std::vector<int> &inblocks)
{
DictBlockProcessor foo(inblocks);
blockManager->processBlocks(foo);
}
// can go to Full
void BtreeDict::map(const EntryProcessor &processor)
{
accessBlock(root).doMap(*this, processor);
}
/**
 * Copies the key bytes stored in the entry at offset @p entry into
 * @p buffer, starting at the entry's compression position. Bytes in front of
 * that position are left as they are.
 */
void DictBlock::restoreKeyInBuffer(int entry, std::vector<unsigned char> &buffer)
{
    const int count = entryKeyLength(entry);
    const int target = entryCompression(entry);
    const int source = entryKey(entry);
    for (int n = 0; n < count; ++n)
        buffer[target + n] = _data[source + n];
}
std::string DictBlock::restoreKey(int entry, std::vector<unsigned char> &buffer)
{
int howMany = entryKeyLength(entry);
int where = entryCompression(entry);
int from = entryKey(entry);
while (howMany-- > 0)
buffer[where++] = _data[from++];
return std::string((const char*)(&buffer[0]), 0, where);
}
/**
 * Returns the key of the entry with id @p id.
 * The entries are visited in order so that the key buffer always holds the
 * previous key when the next one is restored.
 * @throws HelpProcessingException when the block has no entry with that id
 */
std::string DictBlock::findID(int id) throw( HelpProcessingException )
{
    std::vector<unsigned char> keyBuffer(BtreeDict::MaxKeyLength);
    const int end = free();
    int ent = firstEntry();
    while (ent < end)
    {
        if (entryID(ent) == id) // found
            return restoreKey(ent, keyBuffer);
        restoreKeyInBuffer(ent, keyBuffer);
        ent = nextEntry(ent);
    }

    std::stringstream aStrStream;
    aStrStream << "ID not found in block" << std::endl;
    throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
}
/**
 * Stores this block's number in @p blocks at the index of every entry id.
 * The table is not grown; ids beyond its size are not checked for.
 */
void DictBlock::setBlockNumbers(std::vector<int> &blocks) const
{
    int e = firstEntry();
    while (e < _free)
    {
        blocks[entryID(e)] = _number;
        e = nextEntry(e);
    }
}
/**
 * Debug aid: prints key and id of every entry of a leaf block to std::cout
 * (without separator between the entries), or "not leaf" for an inner block.
 */
void DictBlock::listBlock()
{
    if (!_isLeaf)
    {
        std::cout << "not leaf" << std::endl;
        return;
    }

    std::vector<unsigned char> keyBuffer(BtreeDict::MaxKeyLength);
    const int end = free();
    for (int entryPtr = firstEntry(); entryPtr < end; entryPtr = nextEntry(entryPtr))
    {
        std::cout << restoreKey(entryPtr, keyBuffer) << " " <<
            entryID(entryPtr);
    }
}
/**
 * Feeds every entry of this block and of all blocks below it to
 * @p processor. For an inner block the child in front of an entry is
 * traversed before the entry itself, and the last child after the last entry.
 */
void DictBlock::doMap(BtreeDict &owner, const EntryProcessor &processor)
{
    std::vector<unsigned char> keyBuffer(BtreeDict::MaxKeyLength);
    const int end = free();
    int childNo = 0;
    for (int entryPtr = firstEntry(); entryPtr < end; entryPtr = nextEntry(entryPtr))
    {
        if (!_isLeaf)
        {
            owner.accessBlock(getChildIdx(childNo)).doMap(owner, processor);
            ++childNo;
        }
        processor.processEntry(restoreKey(entryPtr, keyBuffer),
            entryID(entryPtr));
    }
    if (!_isLeaf)
        owner.accessBlock(getChildIdx(childNo)).doMap(owner, processor);
}
/**
 * Appends to result the ids of all entries in this block and the blocks below
 * it whose key starts with prefix.
 *
 * owner   dictionary used to reach the child blocks
 * prefix  prefix to look for
 * prefLen length of prefix (the caller passes prefix.size())
 * result  receives the ids found
 */
void DictBlock::withPrefix(BtreeDict &owner, const std::string &prefix,
size_t prefLen, IntegerArray &result)
{
std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength);
int freeSpace = free();
int entryPtr = firstEntry();
if (_isLeaf)
{
// leaf: test every entry
while (entryPtr < freeSpace)
{
if (restoreKey(entryPtr, buffer).find(prefix) == 0)
result.push_back(entryID(entryPtr));
entryPtr = nextEntry(entryPtr);
}
}
else
{
int entryIndex = 0;
while (entryPtr < freeSpace)
{
std::string key = restoreKey(entryPtr, buffer);
// compare only the first prefLen characters of the key with the prefix
if (key.size() > prefLen)
key = key.substr(0, prefLen);
int cmp = key.compare(prefix);
if (cmp < 0)
{
// key sorts before the prefix: skip the entry without visiting the child in front of it
entryPtr = nextEntry(entryPtr);
++entryIndex;
}
else if (cmp == 0)
{
// key has the prefix: take it and search the child in front of it
result.push_back(entryID(entryPtr));
owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result);
entryPtr = nextEntry(entryPtr);
++entryIndex;
}
else
{
// key sorts after the prefix: search the child in front of it, then stop
owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result);
return;
}
}
// all entries passed: search the last child
owner.accessBlock(getChildIdx(numberOfEntries())).withPrefix(owner, prefix, prefLen, result);
}
}
// bytes in front of the key bytes of an entry: key length (1), compression (1), id (4)
int BtreeDict::ENTHEADERLEN = 6;
// size of one dictionary block in bytes, block header included
int BtreeDict::BLOCKSIZE = 2048;
// size of the data section of a dictionary block
int BtreeDict::DATALEN = BtreeDict::BLOCKSIZE - Block::HEADERLEN;
// size of the buffers in which keys are restored
int BtreeDict::MaxKeyLength = 255;
//!!! Careful with that number, Eugene
// pointer number of child 0: nthPointer(508) reads the 4 bytes at offset 2036,
// the last ones of a 2040 byte data section; it has to fit BLOCKSIZE
int BtreeDict::lastPtrIndex = 508;
/** Creates an empty block of BtreeDict::BLOCKSIZE bytes (header included). */
DictBlock::DictBlock() : Block(BtreeDict::BLOCKSIZE)
{
}
/**
 * Returns the block number of child @p index. The child numbers are stored
 * back to front: child 0 is pointer BtreeDict::lastPtrIndex, child 1 the
 * pointer before it, and so on.
 */
int DictBlock::getChildIdx(int index) const
{
    const int pointerNo = BtreeDict::lastPtrIndex - index;
    return nthPointer(pointerNo);
}
/** Total size in bytes of the entry at offset @p entry: entry header plus stored key bytes. */
int DictBlock::entryLength(int entry) const
{
    const int storedKeyBytes = entryKeyLength(entry);
    return BtreeDict::ENTHEADERLEN + storedKeyBytes;
}
/** Offset of the stored key bytes of the entry at offset @p entry. */
int DictBlock::entryKey(int entry) const
{
    const int keyOffset = entry + BtreeDict::ENTHEADERLEN;
    return keyOffset;
}
/**
 * Stores @p number at @p index of @p blocks. A table that is too small is
 * first grown to index + 1000 elements; new elements are 0.
 */
void setBlockNumber2(std::vector<int> &blocks, size_t index, int number)
{
    const bool fits = index < blocks.size();
    if (!fits)
        blocks.resize(index + 1000);
    blocks[index] = number;
}
/**
 * A dictionary entry outside of a block: the complete key followed by one
 * 0 byte, the id, and a block number that starts as -1.
 */
class Entry
{
public:
    std::vector<unsigned char> key;
    int id;
    int block;

    /** Takes the first @p length bytes of @p keyin as key. */
    Entry(const std::vector<unsigned char> &keyin, int length, int idin)
        : key(length+1), id(idin), block(-1)
    {
        // key was created with length + 1 zero bytes, so the terminator is in place
        for (int pos = 0; pos < length; ++pos)
            key[pos] = keyin[pos];
    }

    /** Takes all characters of @p keyin as key. */
    Entry(const std::string &keyin, int idin)
        : key(keyin.size()+1), id(idin), block(-1)
    {
        for (size_t pos = 0; pos < keyin.size(); ++pos)
            key[pos] = static_cast<unsigned char>(keyin[pos]);
    }

    /**
     * Compares the keys byte by byte as unsigned values, terminators included.
     * @return true when the first differing byte is smaller in this entry;
     *         false when it is larger or when no byte differs
     */
    bool smallerThan(const Entry &other)
    {
        const size_t common = std::min(key.size(), other.key.size());
        for (size_t i = 0; i < common; i++)
        {
            const int mine = key[i] & 0xFF;
            const int theirs = other.key[i] & 0xFF;
            if (mine != theirs)
                return mine < theirs;
        }
        return false;
    }
}; // end of internal class Entry
class FullDictBlock;
/**
 * BtreeDict that can also be modified: entries can be stored, which may
 * create new blocks. The member functions are not defined in this part of
 * the file.
 */
class FullBtreeDict : public BtreeDict
{
protected:
// parameters of the dictionary; not owned as far as this declaration shows
BtreeDictParameters *_params;
bool update;
public:
FullBtreeDict(BtreeDictParameters &params, bool update);
void store(const std::string &bla, int id) throw( HelpProcessingException );
boost::shared_ptr<Entry> insert(FullDictBlock &bl, boost::shared_ptr<Entry> ent);
boost::shared_ptr<Entry> insertHere(FullDictBlock &bl, boost::shared_ptr<Entry> ent)
throw( HelpProcessingException );
FullDictBlock& getNewBlock();
void setModified(Block &bl);
void close(int freeID);
};
/**
 * DictBlock with the operations needed to modify it: setting counters and
 * child numbers, inserting entries and splitting a block.
 */
class FullDictBlock : public DictBlock
{
public:
virtual void setFree(int free);
// the number of entries is the first integer of the data section
void setNumberOfEntries(int n) { setIntegerAt(0, n); }
// counterpart of DictBlock::getChildIdx: child 'index' is stored at pointer lastPtrIndex - index
void setChildIndex(int index, int value)
{
setIntegerAt(4*(BtreeDict::lastPtrIndex - index + 1), value);
}
// the id of the entry at offset i follows the two length bytes
void setEntryID(int i, int id) { setIntegerAt(i + 2, id); }
// note: hides DictBlock::setBlockNumbers, which is not virtual
void setBlockNumbers(std::vector<int> &blocks) const;
bool insert(const Entry &entry);
void makeEntry(int entry, const std::vector<unsigned char> &key, int id, int length, int compr);
bool insert(const Entry &ent, int entryPtr, int compr1, int compr2, int index);
int insertInternal(const Entry &entry);
boost::shared_ptr<Entry> split(FullDictBlock &newbl);
void initInternal(int leftBlock, const Entry &entry);
bool insert(boost::shared_ptr<Entry> entry);
bool insert(boost::shared_ptr<Entry> ent, int entryPtr,
int compr1, int compr2, int index);
};
/**
 * Turns this block into an inner block with exactly one entry.
 * @param leftBlock  block number stored as child 0
 * @param entry      the entry; its block number is stored as child 1
 */
void FullDictBlock::initInternal(int leftBlock, const Entry &entry)
{
    _isLeaf = false;
    setNumberOfEntries(1);
    setChildIndex(0, leftBlock);
    setChildIndex(1, entry.block);

    const int slot = firstEntry();
    // the key vector ends with a 0 byte that is not part of the key
    makeEntry(slot, entry.key, entry.id, entry.key.size() - 1, 0);
    setFree(nextEntry(slot));
}
/**
 * Sets the end of the entry area to data offset @p infree and writes two
 * 0 bytes there as sentinel. _free is kept relative to the first entry.
 */
void FullDictBlock::setFree(int infree)
{
    _free = infree - firstEntry();
    // sentinel
    _data[infree + 1] = 0;
    _data[infree] = 0;
}
/**
 * Splits this block. The entries in front of the middle entry stay here, the
 * entries behind it move to newbl, and the middle entry itself is returned
 * (with its block number set to newbl's number) instead of being kept in
 * either block.
 */
boost::shared_ptr<Entry> FullDictBlock::split(FullDictBlock& newbl)
{
std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength);
int freeSpace = free();
int half = freeSpace/2;
int index = 0; // of middle entry
newbl._isLeaf = _isLeaf;
int ent;
// walk to the first entry starting at or behind the middle of the used
// space, restoring each key so that the buffer holds the preceding key
for (ent = firstEntry(); ent < half; ent = nextEntry(ent))
{
restoreKeyInBuffer(ent, buffer);
++index;
}
int entriesToMove = numberOfEntries() - index - 1;
// middle entry
restoreKeyInBuffer(ent, buffer);
// full key length = shared leading part + stored part
int len = entryKeyLength(ent) + entryCompression(ent);
boost::shared_ptr<Entry> result(new Entry(buffer, len, entryID(ent)));
result->block = newbl._number;
// this block will end where the middle entry started
int newFree = ent;
// rest goes to the new block
ent = nextEntry(ent);
restoreKeyInBuffer(ent, buffer);
len = entryKeyLength(ent) + entryCompression(ent);
int nptr = firstEntry();
// the first entry of the new block is stored with its full key (compression 0)
newbl.makeEntry(nptr, buffer, entryID(ent), len, 0);
ent = nextEntry(ent);
// the remaining entries are copied as they are
memmove(&(newbl._data[newbl.nextEntry(nptr)]), &(_data[ent]), freeSpace - ent);
newbl.setNumberOfEntries(entriesToMove);
newbl.setFree(newbl.nextEntry(nptr) + freeSpace - ent);
if (_isLeaf == false) // need to split pointers
{
// offset of the pointer of the last child of this block
int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1);
int to = from + 4*(index + 1);
// copy the child numbers belonging to the moved entries into newbl
memmove(&(newbl._data[to]), &(_data[from]), 4*(entriesToMove + 1));
}
// this entry will end here
setFree(newFree);
setNumberOfEntries(index);
return result;
//!!!remember updating ID -> string association
}
/**
 * Registers this block's number for the id of every entry it holds.
 *
 * @param blocks  id -> block number table, updated through setBlockNumber2
 */
void FullDictBlock::setBlockNumbers(std::vector<int> &blocks) const
{
    // NOTE(review): the bound is the raw _free member, whereas other scans
    // in this class stop at free(); confirm the two agree for this walk.
    int cursor = firstEntry();
    while (cursor < _free)
    {
        setBlockNumber2(blocks, entryID(cursor), _number);
        cursor = nextEntry(cursor);
    }
}
// Writes `ent` into this block at a position that has already been located.
//
// ent:      entry to store (key, id and, for internal blocks, child block)
// entryPtr: offset in _data where the new entry is written
// compr1:   compression stored for the new entry; that many leading key bytes
//           are not written
// compr2:   compression the entry currently at entryPtr should have once the
//           new entry precedes it
// index:    position of the new entry; its child pointer is stored at
//           index + 1 (internal blocks only)
//
// Returns false, leaving the block untouched, when the entry does not fit.
bool FullDictBlock::insert(boost::shared_ptr<Entry> ent, int entryPtr,
int compr1, int compr2, int index)
{
const std::vector<unsigned char> &key = ent->key;
int keyLen = key.size() - 1 - compr1;
int freeSpace = free();
// calculate how much space is needed to add the new entry
// first, how many bytes are needed for just the new entry
int demand = BtreeDict::ENTHEADERLEN + keyLen;
// adding an entry can increase compression in the following entry
int increase = 0;
if (entryPtr < freeSpace)
if (entryCompression(entryPtr) < compr2)
increase = compr2 - entryCompression(entryPtr);
/*
std::cerr << "key " << key << std::endl;
std::cerr << "entryPtr " << entryPtr << std::endl;
std::cerr << "compr1 " << compr1) << std::endl;
std::cerr << "compr2 " << compr2) << std::endl;
std::cerr << "index " << index) << std::endl;
std::cerr << "demand " << demand) << std::endl;
std::cerr << "increase " << increase) << std::endl;
*/
// check if enough space is available
// leaf: the whole data area minus the sentinel; internal: everything below
// the child pointer area after it has grown by one pointer
int limit = _isLeaf ? BtreeDict::DATALEN-2 : 4*(BtreeDict::lastPtrIndex-numberOfEntries()-1);
if (freeSpace + demand - increase <= limit) // 2 for sentinel
{
if (entryPtr < freeSpace)
{
// need to shift extant entries forward
// when the following entry gains compression, its first `increase` key
// bytes are dropped: the move starts after its header plus those bytes
int toMove = increase > 0 ? entryPtr + BtreeDict::ENTHEADERLEN + increase : entryPtr;
// move entries
memmove(&(_data[toMove + demand - increase]), &(_data[toMove]), freeSpace - toMove);
if (increase > 0)
{
// update header
// byte 0 of the header is the stored key length, byte 1 the compression
unsigned char tmp = static_cast<unsigned char>(increase);
_data[entryPtr] = _data[entryPtr] - tmp;
_data[entryPtr + 1] = _data[entryPtr + 1] + tmp;
// shift header
memmove(&(_data[entryPtr + demand]), &(_data[entryPtr]), BtreeDict::ENTHEADERLEN);
}
}
// now write the new entry in the space made above
makeEntry(entryPtr, key, ent->id, keyLen, compr1);
if (_isLeaf == false)
{
// shift existing child pointers by one slot and store the new child
int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1);
memmove(&(_data[from - 4]), &(_data[from]), 4*(numberOfEntries() - index));
setChildIndex(index + 1, ent->block);
}
setFree(freeSpace + demand - increase);
setNumberOfEntries(numberOfEntries() + 1);
/*
System.err.println("------------list--------------");
byte[] buffer = new byte[MaxKeyLength];
final int freeSpace2 = free();
int entryPtr2 = firstEntry();
while (entryPtr2 < freeSpace2)
{
System.err.println(entryPtr2);
System.err.println(entryKeyLength(entryPtr2));
System.err.println(entryCompression(entryPtr2));
System.err.println(new String(_data,
entryKey(entryPtr2),
entryKeyLength(entryPtr2)));
System.err.println(restoreKey(entryPtr2, buffer)+" "+
entryID(entryPtr2));
entryPtr2 = nextEntry(entryPtr2);
}
System.err.println("------------end--------------");
*/
return true;
}
else
return false;
}
// finds the place and context
//
// Scans the prefix-compressed entries of this block for the position of
// entry->key and delegates the actual write to the five-argument insert().
// If an entry with the same key already exists, only its id is overwritten.
//
// Returns true when the entry was stored or updated, false when the
// five-argument insert() reports that it does not fit.
bool FullDictBlock::insert(boost::shared_ptr<Entry> entry)
{
const std::vector<unsigned char> &inkey = entry->key;
// the last byte of the key vector is not counted as key content
int inputKeyLen = inkey.size() - 1;
int freeSpace = free();
int entryPtr = firstEntry();
// nCharsEqual: leading bytes of inkey matched so far
// prevNCEqual: value of nCharsEqual before the current entry was compared
// compression: compression byte of the entry at entryPtr
int nCharsEqual = 0;
int prevNCEqual = 0;
int compression = 0;
for (int entryIndex = 0;;)
{
if (entryPtr == freeSpace)
// ran off the end: append
return insert(entry, entryPtr, nCharsEqual, 0, numberOfEntries());
else if (compression == nCharsEqual)
{
// the stored part of this entry starts exactly where the match stopped,
// so the comparison can continue byte by byte
int keyLen = entryKeyLength(entryPtr);
int keyPtr = entryKey(entryPtr), i;
prevNCEqual = nCharsEqual;
for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++)
++nCharsEqual;
if (i == keyLen)
{
if (nCharsEqual == inputKeyLen)
{
// identical key: replace the id in place
HCDBG(std::cerr << "setting to " << entry->id << std::endl);
setEntryID(entryPtr, entry->id);
return true;
}
}
else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF))
// input key sorts before this entry: insert in front of it
return insert(entry, entryPtr, prevNCEqual, nCharsEqual, entryIndex);
}
else if (compression < nCharsEqual) // compression dropped
{
int index = entryPtr == freeSpace ? numberOfEntries() : entryIndex;
return insert(entry, entryPtr, nCharsEqual, compression, index);
}
// skip entries that share more leading bytes with their predecessor than
// the input key matched; the loop reads the header at entryPtr even when
// entryPtr has reached the end of the entries
do
{
entryPtr = nextEntry(entryPtr);
++entryIndex;
}
while (entryCompression(entryPtr) > nCharsEqual);
compression = entryCompression(entryPtr);
}
}
/// Number of FullDictBlock objects handed out by FullDictBlockFactory.
static int fulldictcount;

/// BlockFactory that produces FullDictBlock instances and counts them.
class FullDictBlockFactory : public BlockFactory
{
public:
    /// Allocates a new FullDictBlock with `new`, counted in fulldictcount.
    Block* makeBlock() const
    {
        ++fulldictcount;
        Block *fresh = new FullDictBlock;
        return fresh;
    }
};
class FullDictBlockProcessor : public BlockProcessor
{
public:
FullDictBlockProcessor(std::vector<int> &_blocks) : BlockProcessor(_blocks) {}
void process(const Block &block)
{
((const FullDictBlock&)block).setBlockNumbers(blocks);
}
};
/**
 * Opens the dictionary described by `params` and builds the id -> block
 * number table by visiting every block once.
 *
 * @param params   dictionary parameters; a pointer to them is kept
 * @param _update  stored in `update` and forwarded to init()
 */
FullBtreeDict::FullBtreeDict(BtreeDictParameters &params, bool _update) :
    _params(&params), update(_update)
{
    init(_params, update, new FullDictBlockFactory());
    HCDBG(std::cerr << "id is " << params.getFreeID() << std::endl);

    // one slot per id handed out so far
    blocks.resize(params.getFreeID());
    FullDictBlockProcessor numberingPass(blocks);
    blockManager->processBlocks(numberingPass);
}
/// Tells the block manager that block `bl` has been changed.
void FullBtreeDict::setModified(Block &bl)
{
    const int blockNumber = bl._number;
    blockManager->setModified(blockNumber);
}
/// Obtains a new block from the block manager and marks it as modified
/// before handing it out.
FullDictBlock& FullBtreeDict::getNewBlock()
{
    FullDictBlock &fresh = (FullDictBlock&)blockManager->getNewBlock();
    setModified(fresh);
    return fresh;
}
/**
 * Inserts `ent` into block `bl`, splitting the block if it is full.
 *
 * @return a null pointer when the entry fitted; otherwise the middle entry
 *         produced by the split, which the caller must insert one level up
 * @throws HelpProcessingException if the entry fits into neither half after
 *         the split
 */
boost::shared_ptr<Entry> FullBtreeDict::insertHere(FullDictBlock &bl, boost::shared_ptr<Entry> ent)
    throw( HelpProcessingException )
{
    setModified(bl); // to be modified in any case
    if (bl.insert(ent))
        return boost::shared_ptr<Entry>();

    // no room: split and retry in whichever half the key belongs to
    FullDictBlock &sibling = getNewBlock();
    boost::shared_ptr<Entry> middle = bl.split(sibling);
    sibling.setBlockNumbers(blocks);

    FullDictBlock &target = middle->smallerThan(*ent) ? sibling : bl;
    if (!target.insert(ent))
    {
        std::stringstream aStrStream;
        aStrStream << "entry didn't fit into a freshly split block" << std::endl;
        throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
    }
    return middle;
}
/**
 * Writes one entry into _data.
 *
 * @param entry   offset of the entry header
 * @param key     full key; only the bytes from `compr` onwards are stored
 * @param id      id stored through setEntryID
 * @param length  number of key bytes stored (header byte 0)
 * @param compr   number of leading key bytes omitted (header byte 1)
 */
void FullDictBlock::makeEntry(int entry, const std::vector<unsigned char> &key, int id, int length, int compr)
{
    const unsigned char lengthByte = static_cast<unsigned char>(length);
    const unsigned char comprByte = static_cast<unsigned char>(compr);
    _data[entry] = lengthByte;
    _data[entry + 1] = comprByte;
    setEntryID(entry, id);
    memmove(&(_data[entryKey(entry)]), &(key[compr]), length);
}
// Locates entry.key in this block without inserting anything.
//
// Returns the index of the child to descend into, i.e. the index of the first
// entry that sorts after the key, or numberOfEntries() when the key sorts
// after all entries. If an entry with exactly this key exists, its id is
// overwritten with entry.id and -1 is returned.
int FullDictBlock::insertInternal(const Entry &entry)
{
const std::vector<unsigned char> &inkey = entry.key;
// the last byte of the key vector is not counted as key content
int inputKeyLen = inkey.size() - 1;
int entryPtr = firstEntry();
int freeSpace = free();
// nCharsEqual: leading bytes of inkey matched so far
// compression: compression byte of the entry at entryPtr
int nCharsEqual = 0;
int compression = 0;
for (int entryIndex = 0;;)
{
if (entryPtr == freeSpace)
return numberOfEntries();
else if (compression == nCharsEqual)
{
int i;
int keyLen = entryKeyLength(entryPtr);
int keyPtr = entryKey(entryPtr);
for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++)
++nCharsEqual;
if (i == keyLen)
{
if (nCharsEqual == inputKeyLen)
{
// exact match: this call modifies the block
setEntryID(entryPtr, entry.id);
return -1;
}
}
else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF))
return entryIndex;
}
else if (compression < nCharsEqual) // compression dropped
return entryPtr >= freeSpace ? numberOfEntries() : entryIndex;
// skip entries that share more leading bytes with their predecessor than
// the input key matched
do
{
entryPtr = nextEntry(entryPtr);
++entryIndex;
}
while (entryCompression(entryPtr) > nCharsEqual);
compression = entryCompression(entryPtr);
}
}
/*
  Delegation to powerful primitives at the FullDictBlock level lets us
  express the insertion algorithm very succinctly here.

  Recursively inserts `ent` below block `bl`.

  Returns a null pointer when nothing has to be inserted into the parent of
  `bl`; otherwise the entry produced by splitting `bl`, which the caller must
  insert one level up (store() creates a new root for it).
*/
boost::shared_ptr<Entry> FullBtreeDict::insert(FullDictBlock &bl, boost::shared_ptr<Entry> ent)
{
    if (bl._isLeaf)
        return insertHere(bl, ent);

    const int index = bl.insertInternal(*ent);
    if (index == -1)
    {
        // The key already lives in this internal block and insertInternal()
        // has overwritten its id in place. Nothing is promoted: returning
        // the input entry here would make the caller treat it as the
        // result of a split. The block content changed, so mark it dirty.
        setModified(bl);
        return boost::shared_ptr<Entry>();
    }

    boost::shared_ptr<Entry> promoted = insert((FullDictBlock&)child(bl, index), ent);
    if (promoted.get())
        promoted = insertHere(bl, promoted); // child was split: take its middle entry
    return promoted;
}
void FullBtreeDict::store(const std::string &key, int id) throw( HelpProcessingException )
{
HCDBG(std::cerr << "so storing " << key << " id " << id << std::endl);
if (key.size() >= 250)
{
std::stringstream aStrStream;
aStrStream << "token " << key << " too long" << std::endl;
throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
}
boost::shared_ptr<Entry> aTemp(new Entry(key, id));
FullDictBlock &rBlock = (FullDictBlock&)accessBlock(root);
boost::shared_ptr<Entry> entry = insert(rBlock, aTemp);
if (entry.get())
{
// new root; writing to params needed
FullDictBlock &nbl = getNewBlock();
nbl.initInternal(root, *entry);
setBlockNumber2(blocks, entry->id, root = nbl._number);
_params->setRoot(root);
}
}
/**
 * Records the next free id in the parameters, writes the parameters back to
 * the schema when opened for update, and closes the underlying dictionary.
 *
 * @param freeID  first id that has not been handed out
 */
void FullBtreeDict::close(int freeID)
{
    _params->setFreeID(freeID);
    if (update)
    {
        _params->updateSchema();
    }
    BtreeDict::close();
}
/// A concept id together with the [begin, end) span it was found at.
class ConceptLocation
{
public:
    int _concept;
    int _begin;
    int _end;

public:
    ConceptLocation(int conceptID, int begin, int end);

    /// Sort helpers; they take `array` and the two ints i1, i2.
    static void sortByConcept(std::vector<ConceptLocation> &array, int i1, int i2);
    static void sortByPosition(std::vector<ConceptLocation> &array, int i1, int i2);

    int getConcept() const { return _concept; }
    // parameter deliberately not called "concept": that is a keyword from C++20 on
    void setConcept(int conceptID) { _concept = conceptID; }
    int getBegin() const { return _begin; }
    int getEnd() const { return _end; }
    /// Distance between end and begin.
    int getLength() const { return _end - _begin; }

    /// Member-wise equality.
    bool equals(const ConceptLocation &other) const
    {
        if (_concept != other._concept)
            return false;
        if (_begin != other._begin)
            return false;
        return _end == other._end;
    }
};
class DocumentCompressor;
// Search index built on top of IndexAccessor.
// Owns the schema, the token dictionary and the POSITIONS/OFFSETS streams
// (see init(), close() and the get*File() accessors).
class Index : public IndexAccessor
{
protected:
// token -> id map consulted by intern() before the dictionary
typedef std::hash_map<std::string, int, pref_hash> IndexHashtable;
// true when the index was opened for writing
bool _update;
IndexHashtable _cache;
Schema *_schema;
private:
BtreeDictParameters *_dictParams;
FullBtreeDict *_dict;
// next id intern() hands out; set in init()
int _freeID;
// opened lazily by getPositionsFile() / getOffsetsFile()
std::fstream *_positionsFile;
std::fstream *_offsetsFile;
DocumentCompressor *_documentCompressor;
IntegerArray _concepts;
IntegerArray _offsets;
std::vector<unsigned char> _allLists; // POSITIONS
void readDocumentsTable(const std::string &fileName);
void readOffsetsTables(const std::string &fileName);
void readPositions();
protected:
IntegerArray _microIndexOffsets;
IntegerArray _documents;
IntegerArray _titles;
std::vector<unsigned char> _positions;
private:
int _positionsCacheSize;
int _currentBatchOffset;
bool _allInCache;
protected:
virtual void writeOutOffsets();
public:
Index(const fs::path &indexName, bool update);
virtual ~Index();
// (re)loads schema, dictionary and, if present, the existing index parts
void init();
// returns the id of `name`, assigning a new one if it is not cached
int intern(const std::string &name);
std::fstream& getPositionsFile();
std::fstream& getOffsetsFile();
DocumentCompressor& getDocumentCompressor();
virtual void compress(int docID, int titleID,
std::vector<ConceptLocation> &locations,
std::vector<ConceptLocation> &extents);
// closes the dictionary and, in update mode, writes out the inverted index
void close();
};
/**
 * Creates an index object without touching any files; init() does the
 * actual loading.
 *
 * @param indexName  passed on to IndexAccessor
 * @param update     true to open the index for writing
 */
Index::Index(const fs::path &indexName, bool update)
    : IndexAccessor(indexName),
      _update(update),
      _cache(256),
      _schema(NULL),
      _dictParams(NULL),
      _dict(NULL),
      _freeID(0), // previously left indeterminate until init() ran
      _positionsFile(0),
      _offsetsFile(0),
      _documentCompressor(0),
      _positionsCacheSize(0),
      _currentBatchOffset(0),
      _allInCache(false)
{
}
class CompressorIterator;
// Bit-level reader over a byte source supplied by the derived class through
// getNextByte(). Bits are consumed from the most significant bit downwards.
class Decompressor
{
private:
static int BitsInByte;
static int NBits;
// byte currently being consumed
int _readByte;
// number of bits of _readByte not yet consumed
int _toRead;
// state used by readNext(); reset by beginIteration() and initReading()
int _path;
protected:
// supplies the next input byte
virtual int getNextByte() = 0;
// resets the bit position; _readByte is not touched here
virtual void initReading() { _toRead = 0; _path = 0; }
private:
// counts the zero bits up to the next one bit, which is consumed as well
int countZeroes();
// reads 1 bit; returns non-0 for bit "1"
int read();
public:
// reads kBits bits and returns them as an integer
int read(int kBits);
void beginIteration() { _path = 0; }
bool readNext(int k, CompressorIterator &it);
void decode(int k, IntegerArray &array);
void ascDecode(int k, IntegerArray &array);
int ascendingDecode(int k, int start, std::vector<int> &array);
virtual ~Decompressor() {}
};
int Decompressor::BitsInByte = 8;
int Decompressor::NBits = 32;
/// Decompressor that reads its input from a byte vector it does not own.
class ByteArrayDecompressor : public Decompressor
{
private:
    const std::vector<unsigned char> *_array;
    int _index;   // position of the next byte to hand out
    int _index0;  // position reading started at

public:
    ByteArrayDecompressor(const std::vector<unsigned char> *array, int index)
    {
        initReading(array, index);
    }

    using Decompressor::initReading;

    /// Restarts reading from `array` at position `index`.
    virtual void initReading(const std::vector<unsigned char> *array, int index)
    {
        _array = array;
        _index0 = index;
        _index = index;
        Decompressor::initReading();
    }

    /// Number of bytes consumed since the last initReading(array, index).
    int bytesRead()
    {
        return _index - _index0;
    }

protected:
    /// Returns the byte at the current position and advances; no bounds check.
    int getNextByte()
    {
        int ret = (*_array)[_index] & 0xFF;
        HCDBG(fprintf(stderr, "ByteArrayDecompressor::getNextByte of %d at index %d\n", ret, _index));
        ++_index;
        return ret;
    }
};
bool isExtensionMode( void );
class IndexInverter;
// Cursor over the concept ids of one document, decoded group by group from
// the document's slice of the positions byte array.
class MicroIndex
{
public:
static int RANGE;
static int NConcepts;
private:
// _concepts[_ix] / RANGE for the current position; used for ordering
int _currentRange;
int _documentNumber;
// concepts of the current group; _nc of them are valid
std::vector<int> _concepts;
// index of the next group to decode
short _group;
// index into _concepts of the next concept to hand out
short _ix;
IntegerArray _kTable;
IntegerArray _offsets;
IntegerArray _maxConcepts;
const std::vector<unsigned char> *_data;
// byte offset in *_data where this document's data starts
int _base;
// index of the last group
int _limit;
int _nc;
ByteArrayDecompressor _decmp;
public:
MicroIndex(int documentNumber, const std::vector<unsigned char> *positions, int index);
// orders by current range first, document number second
bool smallerThan(const MicroIndex &other)
{
return _currentRange < other._currentRange ||
_currentRange == other._currentRange &&
_documentNumber < other._documentNumber;
}
private:
// Decodes the next group into _concepts. Returns false when all groups
// have been consumed.
bool next()
{
if (_group <= _limit)
{
int shift, index;
if (_group > 0)
{
// later groups start at a decoded offset and continue from the
// previous group's maximum concept
index = _base + _offsets[_group - 1];
shift = _maxConcepts[_group - 1];
}
else
{
index = _base;
shift = 0;
}
_decmp.initReading(_data, index);
_nc = _decmp.ascendingDecode(_kTable[_group*2], shift, _concepts);
HCDBG(std::cerr << "nc b set to " << _nc << std::endl);
if (_group < _limit)
{
// every group but the last gets its maximum concept appended
HCDBG(fprintf(stderr, "microindex concept index %d set to %d\n", _nc, _maxConcepts[_group]));
_concepts[_nc++] = _maxConcepts[_group];
}
_currentRange = _concepts[_ix = 0]/RANGE;
_group++;
return true;
}
else
return false;
}
// Reads the header byte at _base and sets up decoding according to its
// top two bits.
void openDocumentIndex()
{
unsigned int kk = (*_data)[_base] & 0xFF;
HCDBG(std::cerr << "openDocumentIndex, kk is " << kk
<< " base is " << _base << std::endl);
switch (kk >> 6) // get type
{
case 0: // single group, no extents
// the low six bits of the header are the k used for decoding
_decmp.initReading(_data, _base += 2);
_nc = _decmp.ascendingDecode(kk & 0x3F, 0, _concepts);
HCDBG(std::cerr << "nc a set to " << _nc << std::endl);
_currentRange = _concepts[_ix = 0]/RANGE;
_limit = 0;
_group = 1;
break;
case 2: // multi group, no extents
{
// the k table is decoded first; its last two values are the k's for
// the offsets and the max-concept tables and are removed from it
_decmp.initReading(_data, _base + 1);
_decmp.decode(kk & 0x3F, _kTable);
int last = _kTable.back();
_kTable.pop_back();
_decmp.ascDecode(last, _offsets);
last = _kTable.back();
_kTable.pop_back();
_decmp.ascDecode(last, _maxConcepts);
_base += 1 + _decmp.bytesRead();
_limit = _maxConcepts.size();
_group = 0;
next();
}
break;
case 1: // single group, extents
case 3: // multi group, extents
// nothing is decoded for these types; none of the cursor members are
// assigned in this branch
if( !isExtensionMode() )
std::cerr << "extents not yet implemented" << std::endl;
break;
}
}
public:
// Hands this document's pending concepts to `lists`; returns true while the
// document still has concepts left.
bool process(IndexInverter &lists);
};
int MicroIndex::RANGE = 1024;
int MicroIndex::NConcepts = 16;
/**
 * Growable bit sink. Bits are shifted into _word from the right; every time
 * NBits bits have been collected the word is appended to _array.
 *
 * _free is the number of words used in _array, _size the number of words
 * _array can hold, and _avail the number of bits still free in _word (after
 * close() or concatenate(): the number of unused bits in the last stored
 * word).
 */
class BitBuffer
{
private:
    static int InitSize;
    static int NBits;
    static int BitsInByte;
    static int BytesInInt;
    int _avail;
    unsigned int _word;
    int _free;
    int _size;
    std::vector<unsigned int> _array;
public:
    BitBuffer() : _avail(NBits), _word(0), _free(0), _size(InitSize)
    {
        _array.resize(InitSize);
    }
    /// Flushes a partially filled word, left-aligned. If no bits are
    /// pending, _avail is set to 0 instead.
    void close()
    {
        if (_avail < NBits)
            store(_word << _avail);
        else
            _avail = 0;
    }
    /// Writes the stored words to `out`: all but the last through writeInt,
    /// the last one byte-wise (highest byte first), skipping bytes that are
    /// completely unused according to _avail.
    void write(std::fstream &out) const
    {
        // nothing stored: there is no "last word" to read
        if (_free == 0)
            return;
        for (int i = 0; i < _free - 1; i++)
            writeInt(out, _array[i]);
        unsigned int word = _array[_free - 1];
        int bytes = BytesInInt - _avail/BitsInByte;
        int shift = NBits;
        while (bytes-- > 0)
            writeByte(out, static_cast<unsigned char>((word >> (shift -= BitsInByte)) & 0xFF));
    }
    /// Forgets all stored bits; the storage itself is kept.
    void clear()
    {
        _word = 0;
        _avail = NBits;
        _free = 0;
    }
    int byteCount() { return _free*BytesInInt - _avail/BitsInByte; }
    int bitCount() { return _free*NBits - _avail; }
    /// Makes this buffer a copy of `rhs`.
    void setFrom(const BitBuffer &rhs)
    {
        _word = rhs._word;
        _avail = rhs._avail;
        _free = rhs._free;
        _array = rhs._array;
        // The copy may be smaller or larger than our previous storage; keep
        // the capacity bookkeeping in step so that store() grows in time.
        _size = static_cast<int>(_array.size());
    }
private:
    void growArray(int newSize)
    {
        _array.resize(newSize);
        _size = newSize;
    }
    /// Appends one complete word, doubling the storage when it is full.
    void store(unsigned int value)
    {
        if (_free == _size)
            growArray(_size * 2);
        HCDBG(fprintf(stderr, "store of %x to %d\n", (int)value, _free));
        _array[_free++] = value;
    }
public:
    /// Appends a single bit (0 or 1).
    void append(int bit)
    {
        _word = (_word << 1) | bit;
        if (--_avail == 0)
        {
            store(_word);
            _word = 0;
            _avail = NBits;
        }
    }
    /// Appends the low kBits bits of `source`.
    /// NOTE(review): `source` is OR-ed in unmasked, so callers presumably
    /// pass values that fit into kBits bits - confirm.
    void append(unsigned int source, int kBits)
    {
        if (kBits < _avail)
        {
            _word = (_word << kBits) | source;
            _avail -= kBits;
        }
        else if (kBits > _avail)
        {
            // fill up the current word with the top bits of source and
            // start the next word with the remainder
            int leftover = kBits - _avail;
            store((_word << _avail) | (source >> leftover));
            _word = source;
            _avail = NBits - leftover;
        }
        else
        {
            store((_word << kBits) | source);
            _word = 0;
            _avail = NBits;
        }
    }
    /// Appends the stored words of `bb` behind the bits stored here.
    /// NOTE(review): both branches work on stored words only, which suggests
    /// both buffers are expected to have been close()d first - confirm.
    void concatenate(const BitBuffer &bb)
    {
        if (_size - _free < bb._free)
            growArray(_free + bb._free + 1);
        if (_avail == 0)
        {
            // word-aligned: plain copy
            memmove(&_array[_free], &bb._array[0], bb._free * sizeof(unsigned int));
            _avail = bb._avail;
            _free += bb._free;
            HCDBG(fprintf(stderr, "free bumped to %d\n", _free));
        }
        else
        {
            // unaligned: every source word is split across two target words
            int tp = _free - 1; // target
            int sp = 0; // source
            do
            {
                _array[tp] |= bb._array[sp] >> (NBits - _avail);
                _array[++tp] = bb._array[sp++] << _avail;
            }
            while (sp < bb._free);
            _free += bb._free;
            if ((_avail += bb._avail) >= NBits)
            {
                _avail -= NBits;
                _free--;
            }
            HCDBG(fprintf(stderr, "other free bumped to %d\n", _free));
        }
    }
};
/// Integer encoder that writes its output bits into a BitBuffer.
class Compressor
{
private:
    static int NBits;
    static int BeginK;
    BitBuffer _buffer;

public:
    /// Writes the encoded bits to `out`.
    void write(std::fstream &out) const
    {
        _buffer.write(out);
    }

    /// Size of the encoded data as reported by the bit buffer.
    int byteCount()
    {
        return _buffer.byteCount();
    }

    /// Discards everything encoded so far.
    void clear()
    {
        _buffer.clear();
    }

    /// Appends the encoded data of `other` to this compressor's data.
    void concatenate(const Compressor &other)
    {
        _buffer.concatenate(other._buffer);
    }

    void encode(const IntegerArray &pos, int k);
    void encode(const IntegerArray &pos, const IntegerArray &len, int k, int k2);
    // startK: starting value for minimization
    int minimize(const IntegerArray &array, int startK);
    int compressAscending(const IntegerArray &array);
};
/**
 * Stores in `out` the first element of `in` followed by the differences
 * between consecutive elements of `in`.
 *
 * `out` is grown to the size of `in` if it is shorter; it is never shrunk,
 * so elements beyond in.size() keep their old values.
 */
void toDifferences(const IntegerArray &in, IntegerArray &out)
{
    const size_t count = in.size();
    if (out.size() < count)
        out.resize(count);
    if (count == 0)
        return;

    out[0] = in[0];
    size_t i = 1;
    while (i < count)
    {
        out[i] = in[i] - in[i - 1];
        ++i;
    }
}
// Turns the per-document concept lists (micro indexes) into per-concept
// document lists, written to the "DOCS" stream, plus a "DOCS.TAB" table of
// the concepts and the sizes of their lists.
//
// Concepts are handled in windows of MicroIndex::RANGE ids:
// [_minConcept, _limit). _arrays[i] collects the documents for concept
// _minConcept + i; flush() writes the window out and advances it.
class IndexInverter
{
private:
static int K;
std::vector<IntegerArray> _arrays;
int _minConcept;
int _limit;
// concepts written so far and, per concept, the byte size of its list
IntegerArray _concepts;
IntegerArray _offsets;
Compressor _compr;
IntegerArray _diffs;
std::fstream *_mainFile;
// heap
// the first _heapSize slots of _heap are live and owned by this object
int _heapSize;
std::vector<MicroIndex*> _heap;
Index &_index;
public:
IndexInverter(Index &index) : _arrays(MicroIndex::RANGE),
_minConcept(0), _limit(MicroIndex::RANGE),
_mainFile(0), _heapSize(0), _index(index) {}
~IndexInverter()
{
delete _mainFile;
for (int i = 0; i < _heapSize; i++)
{
HCDBG(fprintf(stderr, "deleting number %d\n", i));
delete _heap[i];
}
}
// Inverts the index of nDocuments documents.
// microIndexOffsets[i] is the offset of document i inside "POSITIONS".
void invertIndex(int nDocuments, const IntegerArray &microIndexOffsets)
{
_mainFile = _index.getOutputStream("DOCS");
for (int i = 0; i < MicroIndex::RANGE; i++)
_arrays[i] = IntegerArray();
// read in the whole POSITIONS file
// the MicroIndex objects created below keep a pointer to this local vector
std::vector<unsigned char> positions = _index.readByteArray("POSITIONS");
// build heap
_heap.clear();
_heap.resize(_heapSize = nDocuments);
for (int i = 0; i < nDocuments; i++)
_heap[i] = new MicroIndex(i, &positions, microIndexOffsets[i]);
for (int i = _heapSize/2; i >= 0; i--)
heapify(i);
// process till exhausted
// the top of the heap is processed; an exhausted document is replaced by
// the last live one, and the loop ends when the only remaining document
// is exhausted
while (!_heap.empty())
if (_heap[0]->process(*this))
heapify(0);
else if (_heapSize > 1)
{
delete _heap[0];
_heap[0] = _heap[--_heapSize];
heapify(0);
}
else
break;
// closing
flush();
_mainFile->close();
// compress index file
// layout: k byte + compressed concepts, then k byte + compressed sizes
std::fstream *indexFile = _index.getOutputStream("DOCS.TAB");
unsigned char byte = static_cast<unsigned char>(
_compr.compressAscending(_concepts));
indexFile->write( (const char*)&byte, 1 ); // write k
_compr.write(*indexFile);
_compr.clear();
byte = static_cast<unsigned char>(_compr.minimize(_offsets, K));
indexFile->write( (const char*)&byte, 1 ); // write k
_compr.write(*indexFile);
indexFile->close();
delete indexFile;
}
// Called by MicroIndex::process. Adds documentNumber to the list of every
// concept in concepts[start .. n) that lies below _limit and returns the
// index of the first concept that was not consumed. concepts[n] is
// overwritten with a sentinel.
short process(int documentNumber, std::vector<int> &concepts,
int n, short start, bool firstTime)
{
// the smallest pending concept is outside the window: write the window out
if (firstTime && concepts[start] >= _limit)
flush();
concepts[n] = _limit; // sentinel
while (concepts[start] < _limit)
{
_arrays[concepts[start++] - _minConcept].push_back(documentNumber);
}
return start;
}
private:
// restores the heap order (smallest MicroIndex first) below position i
void heapify(int i)
{
int r = (i + 1) << 1, l = r - 1;
int smallest = l < _heapSize && _heap[l]->smallerThan(*_heap[i]) ? l : i;
if (r < _heapSize && _heap[r]->smallerThan(*_heap[smallest]))
smallest = r;
if (smallest != i)
{
MicroIndex *temp = _heap[smallest];
_heap[smallest] = _heap[i];
_heap[i] = temp;
heapify(smallest);
}
}
// Writes every non-empty document list of the current window to _mainFile
// as a k byte followed by the compressed differences, then moves the window
// up by MicroIndex::RANGE.
void flush()
{
for (int i = 0; i < MicroIndex::RANGE; ++i)
{
if (!_arrays[i].empty())
{
toDifferences(_arrays[i], _diffs);
unsigned char byte = static_cast<unsigned char>(
_compr.minimize(_diffs, K));
_mainFile->write( (const char*)&byte, 1 ); // write k
// + 1 accounts for the k byte written above
_offsets.push_back(_compr.byteCount() + 1);
_compr.write(*_mainFile);
_concepts.push_back(_minConcept + i);
_arrays[i].clear();
_diffs.clear();
_compr.clear();
}
}
_limit += MicroIndex::RANGE;
_minConcept += MicroIndex::RANGE;
}
};
int IndexInverter::K = 3;
/**
 * Opens the micro index of one document.
 *
 * @param documentNumber  number of the document
 * @param positions       byte array holding all micro indexes; not owned
 * @param index           offset of this document's data inside *positions
 *
 * All cursor members start out in an "exhausted, no concepts" state
 * (_nc == 0 and _group > _limit, so next() reports the end).
 * openDocumentIndex() overwrites that state for the document types it
 * decodes. For the types it does not decode (extents) the members used to
 * stay indeterminate although smallerThan() and process() read them.
 */
MicroIndex::MicroIndex(int documentNumber, const std::vector<unsigned char> *positions, int index)
    : _currentRange(0),
      _documentNumber(documentNumber),
      _concepts(NConcepts + 1),
      _group(1),
      _ix(0),
      _data(positions),
      _base(index),
      _limit(0),
      _nc(0),
      _decmp(NULL, 0)
{
    openDocumentIndex();
}
/**
 * Feeds this document's pending concepts to the inverter.
 *
 * @return true when concepts are left for a later call (the cursor and
 *         current range have been updated), false when the document is
 *         exhausted
 */
bool MicroIndex::process(IndexInverter &lists)
{
    // firstTime is only true for the group that was current on entry
    for (bool firstTime = true;; firstTime = false)
    {
        const short stop = lists.process(_documentNumber, _concepts, _nc, _ix, firstTime);
        if (stop < _nc)
        {
            _ix = stop;
            _currentRange = _concepts[_ix] / RANGE;
            return true;
        }
        // current group fully consumed: move on to the next one, if any
        if (!next())
            return false;
    }
}
/**
 * Closes the dictionary and the positions stream; in update mode also
 * writes the offsets, saves the schema and builds the inverted index.
 * The offsets stream is released last.
 *
 * (An earlier version contained a disabled step here that compacted the
 * dictionary into a file "compacted" and renamed it to "DICTIONARY".)
 */
void Index::close()
{
    _dict->close(_freeID);

    // deleting a null pointer is a no-op, so no guard is needed
    delete _positionsFile;
    _positionsFile = NULL;

    if (_update)
    {
        writeOutOffsets();
        _dictParams->setFreeID(_freeID);
        _dictParams->updateSchema();
        _schema->save();

        IndexInverter inverter(*this);
        inverter.invertIndex(_documents.size(), _microIndexOffsets);
    }

    delete _offsetsFile;
    _offsetsFile = NULL;
}
/**
 * (Re)creates schema, dictionary parameters and dictionary, then either
 * loads the existing index parts or starts with empty tables.
 */
void Index::init()
{
    if (_update)
    {
        createIfNeeded();
        _cache.clear();
    }

    delete _schema;
    _schema = new Schema(*this, _update);

    delete _dictParams;
    _dictParams = new BtreeDictParameters(*_schema, "DICTIONARY");

    // an index exists if the dictionary parameters could be read back
    const bool indexExists = _dictParams->readState();
    if (!indexExists)
    {
        _dictParams->setBlockSize(2048);
        _dictParams->setRoot(0);
        _dictParams->setFreeID(1);
    }

    delete _dict;
    _dict = new FullBtreeDict(*_dictParams, _update);
    _freeID = _dictParams->getFreeID();
    _documents.clear();

    if (!indexExists)
    {
        _microIndexOffsets.clear();
        _titles.clear();
        return;
    }

    // read in index parts
    _allLists = readByteArray("DOCS");
    readDocumentsTable("DOCS.TAB");
    readOffsetsTables("OFFSETS");
    readPositions();
}
namespace
{
    /**
     * Shortens a token so that it is accepted by FullBtreeDict::store(),
     * which rejects keys of 250 bytes or more.
     *
     * Tokens shorter than 250 bytes are returned unchanged; longer ones are
     * cut down to their first 249 bytes. (The previous implementation
     * called substr() with a start position only and thereby reduced every
     * over-long token to its last character.)
     *
     * The cut is made on bytes, so a multi-byte character at the boundary
     * may be split.
     */
    std::string cliptoken(const std::string &name)
    {
        const std::string::size_type maxLength = 249;
        if (name.size() <= maxLength)
            return name;
        return name.substr(0, maxLength);
    }
}
int Index::intern(const std::string &name)
{
std::string key = cliptoken(name);
IndexHashtable::const_iterator aIter = _cache.find(key);
if (aIter != _cache.end())
return aIter->second;
else
{
//Seeing as we always start off with an empty dictionary,
//our entries will always be in the _cache, so don't ever
//search the underlying dictionary
int id = _freeID++;
_dict->store(key, id);
_cache.insert(IndexHashtable::value_type(key, id)).first->second = id;
return id;
}
}
/// Returns the "POSITIONS" stream, opening it through getRAF() on first use.
std::fstream& Index::getPositionsFile()
{
    if (_positionsFile == NULL)
    {
        _positionsFile = getRAF("POSITIONS", _update);
    }
    return *_positionsFile;
}
/// Returns the "OFFSETS" stream, opening it through getRAF() on first use.
std::fstream& Index::getOffsetsFile()
{
    if (_offsetsFile == NULL)
    {
        _offsetsFile = getRAF("OFFSETS", _update);
    }
    return *_offsetsFile;
}
/// Block manager parameters extended by the vector length ("vl").
class VectorBtreeParameters : public BlockManagerParameters
{
private:
    int _vectorLength;

public:
    /// Reads the vector length from the integer parameter "vl".
    VectorBtreeParameters(Schema &schema, const std::string &partName)
        : BlockManagerParameters(schema, partName),
          _vectorLength(integerParameter("vl"))
    {
    }

    /// Uses the given vector length.
    VectorBtreeParameters(Schema &schema, const std::string &partName, int vecLen)
        : BlockManagerParameters(schema, partName),
          _vectorLength(vecLen)
    {
    }

    /// Passes "vl=<length>" on to BlockManagerParameters::updateSchema().
    void updateSchema()
    {
        std::ostringstream text;
        text << "vl=" << _vectorLength;
        BlockManagerParameters::updateSchema(text.str());
    }

    int getVectorLength()
    {
        return _vectorLength;
    }
};
/// Tells the code after an inner loop what the inner loop decided for the
/// enclosing loop: leave it, restart it, or carry on normally.
enum outerbreak
{
    dobreak,
    docontinue,
    donothing
};
/// Callback interface for visiting vectors: the visitor fills the buffer
/// returned by getVectorBuffer() and then calls processVector().
class VectorProcessor
{
    std::vector<unsigned char> _vector;

public:
    virtual ~VectorProcessor() {}

    /// Handles the vector currently held in the buffer.
    virtual bool processVector() = 0;

    /// Buffer shared between the visitor and processVector().
    std::vector<unsigned char>& getVectorBuffer()
    {
        return _vector;
    }
};
class VectorBlock;
class VectorBtree
{
protected:
VectorBlock *_root;
BlockManager *_blockManager;
VectorBtreeParameters *_params;
int _blockSize;
public:
int _maxEntries;
int _leafDataLimit;
protected:
int _vectorsOffset;
VectorBlock& accessBlock(int index);
VectorBtree() {/*empty*/}
public:
int _vecLen;
int vector(int index) const;
static int memcmp(const std::vector<unsigned char> &v1,
const std::vector<unsigned char> &v2, int i2, int n);
VectorBtree(VectorBtreeParameters *params);
~VectorBtree() { delete _blockManager; }
};
/// BlockFactory producing VectorBlock objects of a fixed size.
class VectorBlockFactory : public BlockFactory
{
private:
    int _blockSize;

public:
    /// @param blockSize  size passed to every block created by makeBlock()
    VectorBlockFactory(int blockSize)
        : _blockSize(blockSize)
    {
    }

    Block* makeBlock() const;
};
/**
 * Opens a vector B-tree (block manager created with `false` as its second
 * argument) and derives the block layout limits from block size and vector
 * length.
 */
VectorBtree::VectorBtree(VectorBtreeParameters *params)
{
    _params = params;
    _vecLen = params->getVectorLength();
    _blockSize = params->getBlockSize();

    _maxEntries = (_blockSize - Block::HEADERLEN - Block::IDLEN) / (_vecLen + Block::IDLEN);
    if (!(_maxEntries & 1)) // needs to be odd
        --_maxEntries;

    _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN;
    _vectorsOffset = (_maxEntries + 1) * Block::IDLEN;

    _blockManager = new BlockManager(_params, false, new VectorBlockFactory(_blockSize));
    _root = &accessBlock(params->getRootPosition());
}
/// Fetches block `index` from the block manager as a VectorBlock.
VectorBlock& VectorBtree::accessBlock(int index)
{
    // VectorBlock is only forward-declared at this point in the file, so
    // the C-style cast is kept as it is.
    VectorBlock &block = (VectorBlock&)_blockManager->accessBlock(index);
    return block;
}
/**
 * Compares v1[0 .. n) with v2[i2 .. i2 + n) byte by byte.
 *
 * @return 0 if all n bytes are equal, otherwise the difference of the first
 *         pair of differing bytes (v1's byte minus v2's byte)
 */
int VectorBtree::memcmp(const std::vector<unsigned char> &v1,
    const std::vector<unsigned char> &v2, int i2, int n)
{
    for (int offset = 0; offset < n; ++offset)
    {
        const unsigned char left = v1[offset];
        const unsigned char right = v2[i2 + offset];
        if (left != right)
            return (left & 0xFF) - (right & 0xFF);
    }
    return 0;
}
class VectorBlock : public Block
{
public:
VectorBlock(int size) : Block(size) {}
protected:
int findIndex(const std::vector<unsigned char> &key, const VectorBtree &tree)
{
int i = 0, j = _free - 1;
while (i <= j)
{
int k = (i + j)/2;
int test = VectorBtree::memcmp(key, _data, tree.vector(k),tree._vecLen);
// std::cerr << "k = " << k << ", test = " << test << std::endl;
if (test > 0)
i = k + 1;
else if (test < 0)
j = k - 1;
else
return -1 - k; // result always negative; "k" encoded
}
return i;
}
private:
int FindVectorsInLeaf(const std::vector<unsigned char> &lo,
const std::vector<unsigned char> &hi, int commLen, int prefLen,
std::vector<unsigned char> &buffer, int size, const VectorBtree &tree)
{
int idx = 0, start;
for (int nBytesEq = 0;;)
{
// std::cout << "idx = " << idx << std::endl;
if (_data[idx] == nBytesEq) // at compression byte
{
int i;
outerbreak hack(donothing);
for (i = nBytesEq; i < tree._vecLen; i++)
{
if (lo[i] == _data[++idx])
++nBytesEq;
else if ((lo[i]&0xFF) < (_data[idx]&0xFF))
if (nBytesEq >= commLen && (i >= prefLen || (hi[i]&0xFF) >= (_data[idx]&0xFF)))
{
start = nBytesEq;
hack = dobreak;
break;
}
else
return 0;
else
{
idx += tree._vecLen - i; // skip
hack = docontinue;
break;
}
}
if (hack == dobreak)
break;
else if (hack == docontinue)
continue;
if (i == tree._vecLen) // eq vec found
if ((_data[++idx]&0xFF) >= prefLen)
{
start = _data[idx++]&0xFF;
break;
}
else
return 0;
}
else if (_data[idx] < nBytesEq) // drop
{
std::cout << idx << std::endl;
nBytesEq = (_data[idx++]);
std::cout << nBytesEq << std::endl;
if (nBytesEq < commLen)
return 0;
else if (lo[nBytesEq] < (_data[idx]&0xFF))
if (hi[nBytesEq] < (_data[idx]&0xFF))
return 0;
else
{
start = nBytesEq; // found
break;
}
else
idx += tree._vecLen - nBytesEq;
}
else if ((_data[idx]&0xFF) == 0xFF)
return 0;
else // compression is bigger
idx += tree._vecLen + 1 - _data[idx];
}
int length = std::min(size - start, _free - idx);
buffer[0] = static_cast<unsigned char>(start);
memcpy(&(buffer[1]), &(_data[idx]), length);
buffer[length + 1] = 0;
return length + 1;
}
protected:
bool searchLeafBlock(const std::vector<unsigned char> &key, const VectorBtree &tree)
{
#if 0
processLeafBlock(_printer);
#endif
int nBytesEq = 0;
for (int idx = 0;; idx += tree._vecLen + 1 - _data[idx])
{
if (_data[idx] == nBytesEq)
{
int i, j;
outerbreak hack(donothing);
for (i = _data[idx], j = idx + 1; i < tree._vecLen; i++, j++)
{
if (key[i] == _data[j])
++nBytesEq;
else if ((key[i]&0xFF) < (_data[j]&0xFF))
return false;
else /* key[i] > _data[j] */
{
hack = dobreak;
break;
}
}
if (hack == dobreak)
break;
if (i == tree._vecLen) /* or nBytesEq == _vecLen */
return true; /* equal vector found */
}
else if (_data[idx] < nBytesEq)
return false;
}
return false;
}
public:
bool processLeafBlock(VectorProcessor &processor, const VectorBtree &tree)
{
std::vector<unsigned char> &buffer = processor.getVectorBuffer();
for (int ix = 0; ix < _free; ix += tree._vecLen - _data[ix] + 1)
{
// cmc: the below line was a comment in the original java, somewhere along
// the line I suspect this was written in c++, then into java
// and now I'm putting it back to c++ :-(
// ::memcpy(&buffer[_data[ix]], &_data[ix + 1], _vecLen - _data[ix]);
memcpy(&(buffer[_data[ix]]), &(_data[ix + 1]), tree._vecLen - _data[ix]);
if (processor.processVector())
return true;
}
return false;
}
}; // VectorBlock
/// Allocates a VectorBlock of the configured block size with `new`.
Block* VectorBlockFactory::makeBlock() const
{
    Block *fresh = new VectorBlock(_blockSize);
    return fresh;
}
/// VectorBlock that can tell whether it has room for another insertion.
class FullVectorBlock : public VectorBlock
{
public:
    FullVectorBlock(int size)
        : VectorBlock(size)
    {
    }

    /// A leaf is full once _free exceeds the tree's leaf data limit; an
    /// internal block is full when _free equals the tree's maximum number
    /// of entries.
    bool isFull(const VectorBtree &tree) const
    {
        if (_isLeaf)
            return _free > tree._leafDataLimit;
        return _free == tree._maxEntries;
    }
};
// VectorBtree variant that declares insertion (insertVector) and creates its
// blocks through FullVectorBlockFactory.
class FullVectorBtree : public VectorBtree
{
private:
static int MaxVeclen;
static double SplitRatio;
public:
// update is passed on to the BlockManager
FullVectorBtree(VectorBtreeParameters* params, bool update);
bool insertVector(const std::vector<unsigned char> &key);
private:
bool treeInsertNonfull(const FullVectorBlock &bl, const std::vector<unsigned char> &key);
bool treeInsertNonfullRoot(const std::vector<unsigned char> &key);
FullVectorBlock& getNewBlock();
void enableModif(const Block &bl);
void declareModif(const Block &bl);
public:
// closes the block manager
void close() { _blockManager->close(); }
};
int FullVectorBtree::MaxVeclen = 128;
double FullVectorBtree::SplitRatio = 0.5;
/// BlockFactory producing FullVectorBlock objects of a fixed size.
class FullVectorBlockFactory : public BlockFactory
{
private:
    int _blockSize;

public:
    /// @param blockSize  size passed to every block created by makeBlock()
    FullVectorBlockFactory(int blockSize)
        : _blockSize(blockSize)
    {
    }

    /// Allocates a FullVectorBlock with `new`.
    Block* makeBlock() const
    {
        Block *fresh = new FullVectorBlock(_blockSize);
        return fresh;
    }
};
/**
 * Opens a vector B-tree whose blocks are FullVectorBlocks.
 *
 * @param params  tree parameters (vector length, block size, root position)
 * @param update  passed on to the BlockManager
 */
FullVectorBtree::FullVectorBtree(VectorBtreeParameters *params, bool update)
{
    _params = params;
    _vecLen = params->getVectorLength();
    _blockSize = params->getBlockSize();
    _blockManager = new BlockManager(params, update, new FullVectorBlockFactory(_blockSize));

    _maxEntries = (_blockSize - Block::HEADERLEN - Block::IDLEN) / (_vecLen + Block::IDLEN);
    if (!(_maxEntries & 1)) // needs to be odd
        --_maxEntries;

    _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN;
    _vectorsOffset = (_maxEntries + 1) * Block::IDLEN;
    _root = &accessBlock(params->getRootPosition());
}
/// Receiver for decoded integers: Decompressor::readNext() passes each
/// value it decodes to value().
class CompressorIterator
{
public:
    virtual ~CompressorIterator() {}

    /// Called with one decoded value.
    virtual void value(int decoded) = 0;
};
/**
 * Counts zero bits up to the next one bit, starting with the unread bits of
 * the current byte and fetching further bytes as needed. The terminating
 * one bit is consumed.
 *
 * @return number of zero bits seen before the one bit
 */
int Decompressor::countZeroes()
{
    int count = 0;
    while (true)
    {
        HCDBG(fprintf(stderr, "count is %d\n", count));
        HCDBG(fprintf(stderr, "Decompressor::countZeroes is %x\n", _readByte));
        HCDBG(fprintf(stderr, "_toRead is %d\n", _toRead));
        HCDBG(fprintf(stderr, "_readByte is %x\n", _readByte));
        while (_toRead-- > 0)
        {
            if ((_readByte & (1 << _toRead)) != 0)
            {
                HCDBG(fprintf(stderr, "returning count of %d\n", count));
                return count;
            }
            ++count;
            HCDBG(fprintf(stderr, "int count to %d\n", count));
        }
        // current byte used up: continue with a fresh one
        _readByte = getNextByte();
        _toRead = BitsInByte;
    }
}
/**
 * Reads a single bit from the input.
 * @return a non-zero value when the bit is 1, zero when it is 0
 */
int Decompressor::read()
{
    if (_toRead-- <= 0)
    {
        // current byte is used up: fetch the next one and hand out its top bit
        _toRead = BitsInByte - 1;
        _readByte = getNextByte();
        return _readByte & 0x80;
    }
    return _readByte & (1 << _toRead);
}
/**
 * Reads kBits bits from the input and returns them as one integer, the
 * first bit read ending up as the most significant one.
 *
 * "leg 1" serves the request entirely from the unread part of the current
 * byte. "leg 2" first takes whatever is left in the current byte, then whole
 * bytes, then the leading bits of one further byte, which becomes the new
 * current byte.
 *
 * @param kBits number of bits to read
 * @return the bits read, right-aligned
 */
int Decompressor::read(int kBits)
{
// number of bits of the current byte that have already been consumed
int shift = BitsInByte - _toRead;
if (kBits <= _toRead)
{
HCDBG(fprintf(stderr, "leg 1\n"));
// mask away the consumed high bits, then drop the low bits that stay unread
return ((_readByte<<shift) & 0xFF) >> (shift + (_toRead-=kBits));
}
else
{
HCDBG(fprintf(stderr, "leg 2 _readByte is %d, shift %d\n", _readByte, shift));
// start with the unread remainder of the current byte (if any)
int result = _toRead > 0 ? ((_readByte << shift) & 0xFF) >> shift : 0;
HCDBG(fprintf(stderr, "result is %d\n", result));
// append whole bytes while at least a full byte is still wanted
for (kBits -= _toRead; kBits >= BitsInByte; kBits -= BitsInByte)
{
int foo = getNextByte();
HCDBG(fprintf(stderr, "byte is %d\n", foo));
result = (result << BitsInByte) | foo;
HCDBG(fprintf(stderr, "and result is %d\n", result));
}
if (kBits > 0)
{
// a partial byte is still needed: take its top kBits bits and keep the rest unread
int foo = getNextByte();
HCDBG(fprintf(stderr, "and byte is %d\n", foo));
int thing = BitsInByte - kBits;
HCDBG(fprintf(stderr, "thing is %d\n", thing));
_toRead = thing;
_readByte = foo;
int right = (_readByte >> _toRead);
HCDBG(fprintf(stderr, "right is %d\n", right));
int left = result << kBits;
HCDBG(fprintf(stderr, "kbits are %d\n", kBits));
HCDBG(fprintf(stderr, "left is %d\n", left));
int ret = left | right;
// int ret = (result << kBits) | ((_readByte = foo) >> (_toRead = BitsInByte - kBits));
HCDBG(fprintf(stderr, "and final is %d\n", ret));
return ret;
}
else
{
// request ended exactly on a byte boundary: nothing is left in the buffer
_toRead = 0;
HCDBG(fprintf(stderr, "and this result says %d\n", result));
return result;
}
}
}
/**
 * Decodes the next value from the input and passes it to it.value().
 *
 * A leading 1 bit means the value shares the current _path prefix and only
 * its k low bits follow. A leading 0 bit starts a prefix change: zero bits
 * are counted up to the next 1 bit (same scan as countZeroes(), written out
 * inline and starting at 1), then that many bits are read and merged into
 * _path. If the merge leaves _path unchanged the stream is at its end.
 *
 * @param k  number of low-order bits stored verbatim for each value
 * @param it receiver of the decoded value
 * @return true when a value was delivered, false at the end marker
 */
bool Decompressor::readNext(int k, CompressorIterator &it)
{
if (read() != 0)
{
it.value(_path | read(k));
return true;
}
else
{
// the for-increment refills the bit buffer each time the inner loop runs dry
for (int count = 1;; _readByte = getNextByte(), _toRead = BitsInByte)
{
while (_toRead-- > 0)
{
if ((_readByte & (1 << _toRead)) != 0)
{
int saved = _path;
_path = ((_path >> (k + count) << count) | read(count)) << k;
if (_path != saved)
{
it.value(_path | read(k));
return true;
}
else
{
// unchanged prefix is the end-of-data convention
return false;
}
}
else
{
++count;
}
}
}
}
}
/**
 * Decodes values from the input and appends them to array until the end
 * marker is met.
 *
 * A 1 bit introduces a value that reuses the current prefix; a 0 bit
 * introduces a prefix change whose width is countZeroes() + 1 bits. A prefix
 * change that leaves the prefix unchanged is the end-of-data convention.
 *
 * @param k     number of low-order bits stored verbatim for each value
 * @param array receives the decoded values via push_back
 */
void Decompressor::decode(int k, IntegerArray &array)
{
    int prefix = 0;
    while (true)
    {
        if (read() != 0)
        {
            array.push_back(prefix | read(k));
            continue;
        }
        const int width = countZeroes() + 1;
        const int previous = prefix;
        prefix = ((prefix >> (k + width) << width) | read(width)) << k;
        if (prefix == previous) // convention for end
            return;
        array.push_back(prefix | read(k));
    }
}
/**
 * Decodes a sequence stored as differences: every decoded number is added to
 * a running total (starting at 0) and the running total is appended to array.
 * The bit layout is the same as in decode(); decoding stops when a prefix
 * change leaves the prefix unchanged.
 *
 * @param k     number of low-order bits stored verbatim for each value
 * @param array receives the running totals via push_back
 */
void Decompressor::ascDecode(int k, IntegerArray &array)
{
for (int path = 0, start = 0;;)
{
HCDBG(fprintf(stderr, "path is %d, start is %d\n", path, start));
if (read() != 0)
{
// same prefix as before: only the k low bits follow
int inread = read(k);
start += path | inread;
HCDBG(fprintf(stderr, "inread is %d\n", inread));
int final = start;
HCDBG(fprintf(stderr, "1:Decompressor::ascDecode to %d\n", final));
array.push_back(final);
}
else
{
// prefix change: count bits of new prefix data follow
int count = countZeroes() + 1;
HCDBG(fprintf(stderr, "count is %d\n", count));
int saved = path;
int inread = read(count);
HCDBG(fprintf(stderr, "inread is %d, k is %d, path is %d\n", inread,
k, path));
path = ((path >> (k + count) << count) | inread) << k;
if (path != saved) // convention for end
{
int anotherread = read(k);
HCDBG(fprintf(stderr, "newinread is %d\n", anotherread));
start += path | anotherread;
int final = start;
HCDBG(fprintf(stderr, "2:Decompressor::ascDecode to %d\n", final));
array.push_back(final);
}
else
{
break;
}
}
}
}
/**
 * Decodes a sequence stored as differences into a pre-sized vector.
 *
 * Every decoded number is added to the running total start, and the running
 * total is written to array[0], array[1], ... in turn. Elements are written
 * by index without any bounds check, so array must already be large enough.
 *
 * The local variable hack emulates a labelled "continue" of the outer while
 * loop from inside the two nested loops.
 *
 * @param k     number of low-order bits stored verbatim for each value
 * @param start initial value of the running total
 * @param array destination, written from index 0 upwards
 * @return the number of elements written
 */
int Decompressor::ascendingDecode(int k, int start, std::vector<int> &array)
{
int path = 0, index = 0;
while (true)
{
if (read() != 0)
array[index++] = (start += path | read(k));
else
{
outerbreak hack = donothing;
// the for-increment refills the bit buffer each time the inner loop runs dry
for (int cnt = 0;; _readByte = getNextByte(), _toRead = BitsInByte)
{
while (_toRead-- > 0)
{
if ((_readByte & (1 << _toRead)) != 0)
{
// cnt becomes (number of zero bits seen) + 1
++cnt;
int Path = ((path >> (k + cnt) << cnt) | read(cnt)) << k;
if (Path != path)
{
array[index++] = (start += (path = Path) | read(k));
hack = docontinue;
break;
}
else
// unchanged prefix is the end-of-data convention
return index;
}
else
++cnt;
}
if (hack == docontinue)
break;
}
}
}
}
/** Decompressor that pulls its bytes one at a time from a std::ifstream. */
class StreamDecompressor : public Decompressor
{
public:
    /** Attaches the decompressor to input; the stream is referenced, not copied. */
    StreamDecompressor(std::ifstream &input)
    {
        initReading(input);
    }

    using Decompressor::initReading;

    /** Remembers the stream and resets the base class reading state. */
    virtual void initReading(std::ifstream &input)
    {
        _input = &input;
        Decompressor::initReading();
    }

    /** Reads one byte from the stream and returns it as an int. */
    int getNextByte()
    {
        unsigned char octet;
        _input->read( reinterpret_cast<char*>(&octet), 1 );
        HCDBG(fprintf(stderr, "StreamDecompressor::getNextByte of %d\n", octet));
        return octet;
    }

private:
    std::ifstream *_input;
};
/**
 * Loads the whole positions file into the in-memory _positions cache.
 *
 * The cache size is taken from the file length (a failed tellg() is treated
 * as an empty file), so the file always fits and _allInCache is always set.
 * The former check "_positionsCacheSize <= _positionsCacheSize" compared the
 * value with itself and has been dropped.
 *
 * The data is read with read() rather than readsome(): readsome() only
 * returns what is immediately available in the stream buffer and may
 * therefore deliver fewer bytes than the file holds. An empty file is not
 * read at all, which avoids taking the address of element 0 of an empty
 * vector.
 */
void Index::readPositions()
{
    getPositionsFile();
    //!!! temporary: better than fixed large value, worse than 'intelligent' size mgt
    _positionsFile->seekg(0, std::ios::end);
    _positionsCacheSize = _positionsFile->tellg();
    if (_positionsCacheSize < 0)
        _positionsCacheSize = 0;
    _positionsFile->clear();
    _positionsFile->seekg(0, std::ios::beg);

    _allInCache = true;
    _positions.resize(_positionsCacheSize);
    if (_positionsCacheSize > 0)
        _positionsFile->read((char*)(&_positions[0]), _positionsCacheSize);
    std::cout << "POS fits in cache" << std::endl;
}
void Index::readOffsetsTables(const std::string &fileName)
{
std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary);
unsigned char k1;
in.read( (char*)&k1, 1 );
StreamDecompressor sddocs(in);
sddocs.decode(k1, _documents);
unsigned char k2;
in.read( (char*)&k2, 1 );
_microIndexOffsets.clear();
StreamDecompressor sdoffsets(in);
sdoffsets.ascDecode(k2, _microIndexOffsets);
// decompress titles' ids table
unsigned char k3;
in.read( (char*)&k3, 1 );
_titles.clear();
StreamDecompressor sdtitles(in);
sdtitles.decode(k3, _titles);
}
void Index::readDocumentsTable(const std::string &fileName)
{
std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary);
unsigned char k1;
in.read( (char*)&k1, 1 );
_concepts.clear();
StreamDecompressor sddocs(in);
sddocs.ascDecode(k1, _concepts);
unsigned char k2;
in.read( (char*)&k2, 1 );
_offsets.clear();
_offsets.push_back(0);
StreamDecompressor sdoffsets(in);
sdoffsets.ascDecode(k2, _offsets);
in.close();
}
class ContextTables;

/**
 * Snapshot of the four per-document context arrays. setTables() copies the
 * snapshot back into a ContextTables object.
 */
class Tables
{
public:
    /** Stores private copies of the four arrays. */
    Tables(const std::vector<int> &initialWords,
        std::vector<int> &dests,
        std::vector<int> &linkTypes,
        std::vector<int> &seqNumbers)
        : _initialWordsCached(initialWords)
        , _destsCached(dests)
        , _linkTypesCached(linkTypes)
        , _seqNumbersCached(seqNumbers)
    {
    }

    void setTables(ContextTables &context);

private:
    std::vector<int> _initialWordsCached;
    std::vector<int> _destsCached;
    std::vector<int> _linkTypesCached;
    std::vector<int> _seqNumbersCached;
}; // end of Tables
/**
 * Holds the context (element ancestry) tables of the currently selected
 * document and answers ancestry queries on them.
 *
 * setMicroindex() selects the document; its tables are decoded from
 * _contextData on first use and kept in _cache afterwards.
 */
class ContextTables
{
public:
// the four arrays below describe the currently selected document;
// they are filled by setMicroindex() or by Tables::setTables()
std::vector<int> _initialWords;
// _dests[context] is what parentContext(context) returns
std::vector<int> _dests;
std::vector<int> _linkTypes;
std::vector<int> _seqNumbers;
// always set to _initialWords.size()
int _nTextNodes;
private:
// one entry per document, created lazily; the entries are deleted in the destructor
std::vector<Tables*> _cache;
// cached last position for linear search
int _initialWordsIndex;
// link names are shared between all microindexes in an index
std::vector<std::string> _linkNames;
// offsets to tables' storage in file (or memory)
std::vector<int> _offsets;
std::vector<unsigned char> _contextData; // !!! fully cached for now
// auxiliary: the k parameters of the current document's compressed arrays
IntegerArray _kTable;
// _auxArray will be used as an auxiliary to decode arrays
IntegerArray _auxArray;
// document number last passed to setMicroindex(), -1 initially
int _lastDocNo;
// resized to _dests.size() in setMicroindex(); not otherwise used in this class's visible code
std::vector<int> _markers;
public:
ContextTables(const std::vector<int> &offsets, const std::vector<unsigned char> &contextData,
const std::vector<std::string> &linkNames);
~ContextTables();
void setMicroindex(int docNo);
int parentContext(int context);
const std::string& linkName(int context);
int linkCode(const std::string &linkName);
std::vector<bool> getIgnoredElementsSet(const std::vector<std::string> &ignoredElements);
bool notIgnored(int ctx, const std::vector<bool> &ignoredElements);
int firstParentWithCode(int pos, int linkCode);
int firstParentWithCode2(int pos, int linkCode, int parentCode);
int firstParentWithCode3(int pos, int linkCode, int ancestorCode);
int firstParentWithCode4(int pos, const std::vector<int> &linkCodes);
int firstParentWithCode5(int pos, const std::vector<int> &pathCodes);
int firstParentWithCode7(int pos, int linkCode, int seq);
// true when the link name of the given context is "TITLE"
bool isGoverning(int context) { return linkName(context) == "TITLE"; }
// restarts the linear search done by wordContextLin() from the beginning
void resetContextSearch() { _initialWordsIndex = 0; }
private:
void appendSegment(int context, std::string &result);
int findIndexBin(int wordNumber);
public:
int wordContextLin(int wordNumber);
};
/**
 * Creates the tables object for an index.
 *
 * Copies the three inputs and sizes the per-document cache to one (null)
 * entry per offset. _nTextNodes and _initialWordsIndex are now initialized
 * to 0 as well; previously they were indeterminate until the first
 * setMicroindex() call, so an earlier wordContextLin() or findIndexBin()
 * call read uninitialized values.
 *
 * @param offsets      per-document offsets into contextData
 * @param contextData  the compressed context tables of all documents
 * @param linkNames    link names shared by all documents of the index
 */
ContextTables::ContextTables(const std::vector<int> &offsets, const std::vector<unsigned char> &contextData,
    const std::vector<std::string> &linkNames)
    : _nTextNodes(0), _initialWordsIndex(0), _kTable(5), _auxArray(4096), _lastDocNo(-1)
{
    _offsets = offsets;
    _contextData = contextData;
    _linkNames = linkNames;
    _cache.resize(_offsets.size());
}
/** Deletes every cached Tables object. */
ContextTables::~ContextTables()
{
    for (std::vector<Tables*>::iterator entry = _cache.begin(); entry != _cache.end(); ++entry)
        delete *entry;
}
void ContextTables::setMicroindex(int docNo)
{
if (docNo != _lastDocNo) // check if we need to do anything
{
if (_cache[docNo])
_cache[docNo]->setTables(*this);
else
{
int offset = _offsets[docNo];
int k0 = _contextData[offset] & 0xFF;
ByteArrayDecompressor compr(&_contextData, offset + 1);
_kTable.clear();
compr.decode(k0, _kTable);
// decompress initialWords into auxiliary array
_auxArray.clear();
compr.ascDecode(_kTable[0], _auxArray); // _initialWords
_initialWords = _auxArray;
_nTextNodes = _initialWords.size();
// decompress destinations into auxiliary array
_auxArray.clear();
compr.decode(_kTable[1], _auxArray); // _dests
_auxArray.push_back(-1); // sentinel, root
_dests = _auxArray;
_linkTypes.clear();
compr.decode(_kTable[2], _linkTypes);
_seqNumbers.clear();
compr.decode(_kTable[3], _seqNumbers);
_cache[docNo] = new Tables(_initialWords, _dests, _linkTypes, _seqNumbers);
/*
System.out.println("|_initialWords| = " + _nTextNodes);
System.out.println("|_dests| -1 = " + (_dests.length - 1));
System.out.println("|_seqNumbers| = " + _seqNumbers.length);
System.out.println("|_linkTypes| = " + _linkTypes.length);
*/
}
_lastDocNo = docNo;
_markers.resize(_dests.size());
}
_initialWordsIndex = 0;
}
/** Returns the entry of _dests for the given context (not range-checked). */
int ContextTables::parentContext(int context)
{
    const int parent = _dests[context];
    return parent;
}
/** Returns the link name whose code is stored in _linkTypes[context]. */
const std::string& ContextTables::linkName(int context)
{
    const int typeCode = _linkTypes[context];
    const std::string &label = _linkNames[typeCode];
    return label;
}
/**
 * Looks up the code of a link name.
 * @return the index of the first equal entry in _linkNames, or -1 when there is none
 */
int ContextTables::linkCode(const std::string &inlinkName)
{
    const size_t total = _linkNames.size();
    size_t idx = 0;
    while (idx < total && !(inlinkName == _linkNames[idx]))
        ++idx;
    if (idx == total)
        return -1; // when not found
    return idx;
}
/**
 * Turns a list of element names into a flag vector indexed by link code.
 *
 * @param ignoredElements names of the elements to ignore
 * @return a vector with one flag per known link name, true for every name
 *         that was listed; an empty vector when none of the listed names
 *         is a known link name
 */
std::vector<bool> ContextTables::getIgnoredElementsSet(const std::vector<std::string> &ignoredElements)
{
    std::vector<bool> flags;
    bool anyKnownName = false;
    if (!ignoredElements.empty())
    {
        flags.resize(_linkNames.size());
        for (std::vector<std::string>::const_iterator name = ignoredElements.begin();
            name != ignoredElements.end(); ++name)
        {
            const int code = linkCode(*name);
            if (code <= -1)
                continue;
            flags[code] = true;
            anyKnownName = true;
        }
    }
    if (!anyKnownName)
        return std::vector<bool>();
    return flags;
}
/**
 * Walks from ctx upwards through _dests and checks every visited context
 * against the ignore flags.
 *
 * @param ctx             context to start from
 * @param ignoredElements flags indexed by link code
 * @return false (after printing "hit ignored") as soon as a flagged context
 *         is met, true when a destination of -1 or less is reached first
 */
bool ContextTables::notIgnored(int ctx, const std::vector<bool> &ignoredElements)
{
    for (;;)
    {
        if (ignoredElements[_linkTypes[ctx]])
        {
            std::cout << "hit ignored" << std::endl;
            return false;
        }
        ctx = _dests[ctx]; // parentContext 'hand inlined'
        if (ctx <= -1)
            return true;
    }
}
/**
 * Starting at the parent of the text node that contains word pos, walks up
 * the ancestry and returns the first context whose link code is inlinkCode.
 *
 * @return the matching context, or -1 when the walk reaches the last index
 *         of _dests without a match
 */
int ContextTables::firstParentWithCode(int pos, int inlinkCode)
{
    int current = _dests[wordContextLin(pos)]; // first parent of text node
    const int shift = _nTextNodes;
    const int limit = _dests.size() - 1;
    for (;;)
    {
        if (_linkTypes[current - shift] == inlinkCode)
            return current;
        current = _dests[current];
        if (current == limit)
            return -1;
    }
}
/**
 * Starting at the parent of the text node that contains word pos, walks up
 * the ancestry and returns the first context that has link code inlinkCode
 * and whose parent has link code parentCode.
 *
 * @return the matching context, or -1 when there is none
 */
int ContextTables::firstParentWithCode2(int pos, int inlinkCode, int parentCode)
{
    int current = _dests[wordContextLin(pos)]; // first parent of text node
    const int shift = _nTextNodes;
    const int limit = _dests.size() - 1;
    int parent = _dests[current];
    while (parent < limit)
    {
        const bool parentMatches = (_linkTypes[parent - shift] == parentCode);
        if (parentMatches && _linkTypes[current - shift] == inlinkCode)
            return current;
        current = parent;
        parent = _dests[parent];
    }
    return -1;
}
/**
 * Starting at the parent of the text node that contains word pos, finds the
 * first context with link code inlinkCode and accepts it only if one of its
 * ancestors has link code ancestorCode.
 *
 * @return the context found, or -1 when either search fails
 */
int ContextTables::firstParentWithCode3(int pos, int inlinkCode, int ancestorCode)
{
    int current = _dests[wordContextLin(pos)];
    const int shift = _nTextNodes;
    const int limit = _dests.size() - 1;

    // find first instance of linkCode
    while (current < limit && _linkTypes[current - shift] != inlinkCode)
        current = _dests[current];
    if (current >= limit)
        return -1; // match NOT found

    // found linkCode, check ancestry
    int ancestor = _dests[current];
    while (ancestor < limit)
    {
        if (_linkTypes[ancestor - shift] == ancestorCode) // ancestor confirmed
            return current; // match found, return successful ctx
        ancestor = _dests[ancestor];
    }
    return -1; // match NOT found
}
/**
 * Starting at the parent of the text node that contains word pos, walks up
 * the ancestry and returns the first context whose link code equals any of
 * the given codes.
 *
 * @return the matching context, or -1 when there is none
 */
int ContextTables::firstParentWithCode4(int pos, const std::vector<int> &linkCodes)
{
    const int nCodes = linkCodes.size();
    const int shift = _nTextNodes;
    const int limit = _dests.size() - 1;
    int current = _dests[wordContextLin(pos)];
    while (current < limit)
    {
        const int code = _linkTypes[current - shift];
        int candidate = 0;
        while (candidate < nCodes && code != linkCodes[candidate])
            ++candidate;
        if (candidate < nCodes)
            return current;
        current = _dests[current];
    }
    return -1;
}
/** starting with ctx and going up the ancestry tree look for the first
context with the given path

pathCodes lists the link codes of the path from the outermost element to
the wanted one; its last entry is the code of the wanted context itself.
pathCodes must not be empty (its last element is read unconditionally).
Returns the matching context or -1. */
int ContextTables::firstParentWithCode5(int pos, const std::vector<int> &pathCodes)
{
int nCodes = pathCodes.size();
int lastCode = pathCodes[nCodes - 1];
int shift = _nTextNodes;
int limit = _dests.size() - 1;
int ctx = _dests[wordContextLin(pos)];
for (int parent = _dests[ctx]; parent < limit; parent = _dests[parent])
{
if (_linkTypes[ctx - shift] == lastCode)
{
// try to match the entire path
// 'hack' emulates a labelled continue of the outer loop from the inner one
outerbreak hack = donothing;
for (int i = nCodes - 2, parent2 = parent; i >= 0; i--)
if (_linkTypes[parent2 - shift] != pathCodes[i]) // match failure
{
hack = docontinue;
break; // try to match higher
}
else if ((parent2 = _dests[parent2]) == limit)
return -1;
// on a failed match only 'parent' advances (loop increment); ctx is left as it is
if (hack == docontinue)
continue;
return ctx;
}
else
ctx = parent;
}
return -1;
}
/**
 * Starting at the parent of the text node that contains word pos, walks up
 * the ancestry and returns the first context that has link code inlinkCode
 * and sequence number seq.
 *
 * @return the matching context, or -1 when the walk reaches the last index
 *         of _dests without a match
 */
int ContextTables::firstParentWithCode7(int pos, int inlinkCode, int seq)
{
    int current = _dests[wordContextLin(pos)]; // first parent of text node
    const int shift = _nTextNodes;
    const int limit = _dests.size() - 1;
    for (;;)
    {
        if (_linkTypes[current - shift] == inlinkCode && _seqNumbers[current] == seq)
            return current;
        current = _dests[current];
        if (current == limit)
            return -1;
    }
}
/**
 * Appends one path segment of the form "name[seq]/" to result.
 * The name is "text()" for contexts below _nTextNodes and the context's
 * link name otherwise; seq is the context's sequence number.
 */
void ContextTables::appendSegment(int context, std::string &result)
{
    if (context < _nTextNodes)
        result.append("text()");
    else
        result.append(_linkNames[_linkTypes[context - _nTextNodes]]);
    result += '[';
    std::ostringstream number;
    number << _seqNumbers[context];
    result += number.str();
    result += "]/";
}
/**
 * Binary search in _initialWords (first _nTextNodes entries).
 * @return the index holding wordNumber if present, otherwise the index of
 *         the last entry smaller than wordNumber (-1 when there is none)
 */
int ContextTables::findIndexBin(int wordNumber)
{
    int low = 0;
    int high = _nTextNodes - 1;
    while (low <= high)
    {
        const int middle = (low + high) >> 1;
        const int probe = _initialWords[middle];
        if (probe == wordNumber)
            return middle;
        if (probe < wordNumber)
            low = middle + 1;
        else
            high = middle - 1;
    }
    return low - 1;
}
/**
 * Linear search, resumed from the position cached by the previous call, for
 * the text node that contains wordNumber.
 *
 * @return (index of the first entry of _initialWords greater than
 *         wordNumber) - 1, or _nTextNodes - 1 when no entry is greater
 */
int ContextTables::wordContextLin(int wordNumber)
{
    int idx = _initialWordsIndex;
    while (idx < _nTextNodes && !(_initialWords[idx] > wordNumber))
        ++idx;
    if (idx < _nTextNodes)
    {
        // - 1 if wordNumbers can be the same
        _initialWordsIndex = idx; // cached to speed up next search
        return idx - 1;
    }
    return _nTextNodes - 1;
}
/** Copies the cached arrays into context and updates its text node count. */
void Tables::setTables(ContextTables &context)
{
    context._seqNumbers = _seqNumbersCached;
    context._linkTypes = _linkTypesCached;
    context._dests = _destsCached;
    context._initialWords = _initialWordsCached;
    context._nTextNodes = context._initialWords.size();
}
class Compressor;

/**
 * Index that additionally maintains an "EDGE" vector B-tree and the context
 * tables of the indexed XML documents.
 */
class XmlIndex : public Index
{
private:
    // parameters of the edges B-tree; owned
    VectorBtreeParameters *_edgesParams;
    // the edges B-tree; owned
    FullVectorBtree *_edges;
    // created in init() when context offsets are present; owned
    ContextTables *_contextTables;
    // opened on demand by getContextsFile(); owned
    std::fstream *_contextsFile;
    IntegerArray _contextsOffsets;
    std::vector<unsigned char> _contextsData;
    std::vector<std::string> _linkNames;
protected:
    virtual void writeOutOffsets();
public:
    XmlIndex(const fs::path &index, bool update)
        : Index(index, update), _edgesParams(0), _edges(0), _contextTables(0), _contextsFile(0) {}
    void init();
    void close();
    /** Releases everything this object owns. The contexts file is released
        here too, so it no longer leaks when close() was never called
        (close() resets the pointer to 0, which makes the delete a no-op). */
    virtual ~XmlIndex()
    {
        delete _contextsFile;
        delete _edgesParams;
        delete _edges;
        delete _contextTables;
    }
    std::fstream& getContextsFile();
    using Index::compress;
    virtual void compress(int docID, int titleID,
        std::vector<ConceptLocation> &locations,
        std::vector<ConceptLocation> &extents,
        int k, const Compressor &contextTables);
    const std::vector<std::string>& getLinkNames() { return _linkNames; }
};
/**
 * Initializes the base index and then (re)creates the edges B-tree and,
 * when context offsets are present, the context tables.
 *
 * Objects left over from an earlier init() call are deleted first. The old
 * code released only _edgesParams, so a repeated init() leaked the previous
 * _edges and _contextTables.
 */
void XmlIndex::init()
{
    Index::init();

    // drop whatever a previous init() created; the tree goes before its parameters
    delete _contextTables;
    _contextTables = 0;
    delete _edges;
    _edges = 0;
    delete _edgesParams;

    _edgesParams = new VectorBtreeParameters(*_schema, "EDGE", 9);
    // no stored state: fall back to a block size of 1024
    if (_edgesParams->readState() == false)
        _edgesParams->setBlockSize(1024);
    _edges = new FullVectorBtree(_edgesParams, _update);
    if (!_contextsOffsets.empty())
    {
        _contextsData = readByteArray("CONTEXTS");
#if 0
        _linkNames = (String[])readObject("LINKNAMES");
#endif
        _contextTables = new ContextTables(_contextsOffsets, _contextsData, _linkNames);
    }
}
/**
 * Writes the base class offsets and then, when there are any, the context
 * offsets: one byte holding the value returned by compressAscending()
 * followed by the compressed data.
 */
void XmlIndex::writeOutOffsets()
{
    Index::writeOutOffsets();
    if (_contextsOffsets.empty())
        return;

    std::fstream &out = getOffsetsFile();
    Compressor offsets2;
    const char k = static_cast<char>(offsets2.compressAscending(_contextsOffsets));
    out.write( &k, 1 );
    offsets2.write(out);
}
/** Returns the "CONTEXTS" file, obtaining it through getRAF() on first use. */
std::fstream& XmlIndex::getContextsFile()
{
    if (_contextsFile == 0)
        _contextsFile = getRAF("CONTEXTS", _update);
    std::fstream &file = *_contextsFile;
    return file;
}
/**
 * Closes and releases the contexts file if it was opened, closes the edges
 * B-tree, stores the B-tree parameters in the schema when the index is in
 * update mode, and finally closes the base index.
 */
void XmlIndex::close()
{
    if (_contextsFile != 0)
    {
        _contextsFile->close();
        delete _contextsFile;
        _contextsFile = 0;
    }

    _edges->close();

    const bool persistParameters = _update;
    if (persistParameters)
        _edgesParams->updateSchema();

    Index::close();
}
/**
 * Splits a UTF-8 text into lower-cased word tokens with an ICU word break
 * iterator. Usage: setText() once, then nextToken() until it returns an
 * empty string.
 */
class Tokenizer
{
private:
// the text currently being tokenized
UnicodeString s;
// word break iterator, created in the constructor
BreakIterator *bi;
// offset in s at which the next candidate token begins
int32_t start;
// converter between UTF-8 and ICU strings, opened in the constructor
UConverter *utf8;
// scratch buffer for converting a token to UTF-8; grown on demand
std::vector<char> utfbuffer;
public:
Tokenizer();
~Tokenizer();
void setText(const xmlChar *text);
std::string nextToken();
};
/**
 * Creates an "en_US" word break iterator and a UTF-8 converter; the
 * conversion buffer starts with 64 bytes.
 */
Tokenizer::Tokenizer() : start(BreakIterator::DONE), utfbuffer(64)
{
UErrorCode status = U_ZERO_ERROR;
bi = BreakIterator::createWordInstance("en_US", status);
utf8 = ucnv_open("utf-8", &status);
// NOTE(review): status is not inspected after either call, so a failed
// creation goes unnoticed here
}
/**
 * Releases the break iterator and the converter.
 * When SOLARIS is defined nothing is released (reason not stated in this file).
 */
Tokenizer::~Tokenizer()
{
#if !defined(SOLARIS)
delete bi;
ucnv_close(utf8);
#endif
}
void Tokenizer::setText(const xmlChar *text)
{
UErrorCode status = U_ZERO_ERROR;
s = UnicodeString((const char*)text, -1, utf8, status);
bi->setText(s);
start = ubrk_first(bi);
}
std::string Tokenizer::nextToken()
{
std::string ret;
int32_t end = ubrk_next(bi);
while (end != BreakIterator::DONE)
{
if (ubrk_getRuleStatus(bi) != UBRK_WORD_NONE)
break;
start = end;
end = ubrk_next(bi);
}
if (end != -1 && end != start)
{
UnicodeString token(s, start, end-start);
token = token.toLower();
size_t needed = 0;
UErrorCode status = U_ZERO_ERROR;
while ((needed = token.extract(&utfbuffer[0], utfbuffer.size(), utf8, status)) > utfbuffer.size())
utfbuffer.resize(utfbuffer.size() * 2);
ret = std::string(&utfbuffer[0], needed);
start = end;
}
return ret;
}
// List of libxml2 node pointers.
typedef std::vector<xmlNodePtr> Vector;
/** Stores the concept id together with the begin and end of its location. */
ConceptLocation::ConceptLocation(int conceptID, int begin, int end) :
_concept(conceptID), _begin(begin), _end(end)
{
}
// Two interchangeable sets of comparison helpers for ConceptLocation:
// with EMULATEORIGINALSORT a hand-written quicksort driven by a virtual
// smallerThan(), otherwise plain function objects for std::sort.
#ifdef EMULATEORIGINALSORT
/** Base class providing a quicksort over ConceptLocation vectors; derived
    classes supply the ordering through smallerThan(). */
class ConceptLocationSorter
{
public:
virtual bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) = 0;
private:
// part of quicksort: partitions array[p..r] around the middle element and
// returns the index at which the two parts meet
int partition(std::vector<ConceptLocation> &array, int p, int r)
{
ConceptLocation x = array[(p + r)/2];
int i = p - 1, j = r + 1;
while (true)
{
// move j down to an element that is not greater than the pivot
while (smallerThan(x, array[--j]))
;
// move i up to an element that is not smaller than the pivot
while (smallerThan(array[++i], x))
;
if (i < j)
{
ConceptLocation t = array[i];
array[i] = array[j];
array[j] = t;
}
else
return j;
}
}
public:
/** Sorts array[p..r] (both bounds inclusive). The left part is handled by
    recursion, the right part by the enclosing loop. */
void quicksort(std::vector<ConceptLocation> &array, int p, int r)
{
while (p < r)
{
int q = partition(array, p, r);
quicksort(array, p, q);
p = q + 1;
}
}
};
/** Orders locations by concept id. */
class ConceptSorter : public ConceptLocationSorter
{
public:
bool smallerThan(const ConceptLocation &a, const ConceptLocation &b)
{
return a._concept < b._concept;
}
};
/** Orders locations by begin, then by end. */
class PositionSorter : public ConceptLocationSorter
{
public:
bool smallerThan(const ConceptLocation &a, const ConceptLocation &b)
{
return a._begin < b._begin || a._begin == b._begin && a._end < b._end;
}
};
#else
/** Comparison function object: orders locations by concept id. */
class ConceptSorter
{
public:
bool operator()(const ConceptLocation &a, const ConceptLocation &b) const
{
return a._concept < b._concept;
}
};
/** Comparison function object: orders locations by begin, then by end. */
class PositionSorter
{
public:
bool operator()(const ConceptLocation &a, const ConceptLocation &b) const
{
return a._begin < b._begin || (a._begin == b._begin && a._end < b._end);
}
};
#endif
void ConceptLocation::sortByPosition(std::vector<ConceptLocation> &array, int i1, int i2)
{
#ifdef EMULATEORIGINALSORT
PositionSorter _pComp;
_pComp.quicksort(array, i1, i2 - 1);
#else
std::vector<ConceptLocation>::iterator begin = array.begin();
std::vector<ConceptLocation>::iterator end = begin;
std::advance(begin, i1);
std::advance(end, i2);
std::sort(begin, end, PositionSorter());
#endif
}
void ConceptLocation::sortByConcept(std::vector<ConceptLocation> &array, int i1, int i2)
{
#ifdef EMULATEORIGINALSORT
ConceptSorter _cComp;
_cComp.quicksort(array, i1, i2 - 1);
#else
std::vector<ConceptLocation>::iterator begin = array.begin();
std::vector<ConceptLocation>::iterator end = begin;
std::advance(begin, i1);
std::advance(end, i2);
std::sort(begin, end, ConceptSorter());
#endif
}
// Maps a node to the context number assigned to it (see IndexAdapter::finish).
typedef std::map<xmlNodePtr, int> NodeHashtable;
// Maps a link (element) name to its link code (see IndexAdapter::getLinkCode).
typedef std::hash_map<std::string, int, pref_hash> LinkHashTable;
/**
 * Walks a document tree (process()) whose elements are named "element",
 * "text" and "attribute", and collects from it the concept locations and the
 * context tables (finish()) of one document.
 */
class IndexAdapter
{
private:
// initial size of the two stacks below
static int StackSize;
// names of the elements and attributes that are recognised; set in the constructor
const char* _indexText_Name;
const char* _indexElement_Name;
const char* _indexAttribute_Name;
const char* _nodeID_Name;
const char* _tokenizer_Name;
const char* _attributeName_Name;
// one flag per open "element" element; text is indexed only while the top flag is true
std::vector<bool> _indexOnOffStack;
// top of _indexOnOffStack, -1 when empty
int _sp;
// tokenizer stack pointer; only decremented in startElement()
int _tsp;
// one entry per open "attribute" element
std::vector< std::string > _attributeStack;
// node named by the most recent "nodeID" attribute
xmlNodePtr _currentNode;
// top of _attributeStack, -1 when empty
int _attrSP;
void storeLocation(const std::string &token, int number);
// stores the token at the next word number
void storeLocation(const std::string &token) { storeLocation(token, _lastWordNumber++); }
void storeEdge(int relation, int seqNumber, int destination);
void startElement(xmlNodePtr node);
void attribute(const char *name, const char *value);
void characters(const xmlChar *str) throw( HelpProcessingException );
void endElement(xmlNodePtr node);
void indexText(const xmlChar *str);
// the nodes whose text produced at least one stored location, in document order
Vector _textNodes;
// context numbers handed out by finish()
NodeHashtable _numberedNodes;
public:
// tokens that are counted but not stored
HashSet _stoplist;
LinkHashTable _linkCodes;
// link names in the order in which their codes were assigned
std::vector<std::string> _linknames;
// next link code to hand out; shared by all IndexAdapter objects
static int CurrenMaxLinkCode;
std::vector<ConceptLocation> _locations;
int _availContextNumber;
// first word number of every entry of _textNodes
IntegerArray _initialWords;
IntegerArray _links;
IntegerArray _dests;
IntegerArray _seqNumbers;
// number that the next word will get
int _lastWordNumber;
// word number at the start of the text being indexed, -1 in between
int _firstWord;
bool _anyLocationsStored;
// index used by intern(); not set by this class's visible code
XmlIndex *_index;
private:
static int InitSize;
// set to InitSize in the constructor
int _size;
public:
IndexAdapter();
void process(xmlNodePtr node, xmlDocPtr doc);
void init();
void finish();
// forwards to the index
int intern(const std::string &name) { return _index->intern(name); }
int getLinkCode(const std::string &linkName);
};
// Definitions of IndexAdapter's static members: StackSize sizes the two
// stacks in the constructor, InitSize is the initial value of _size and
// CurrenMaxLinkCode is the next code handed out by getLinkCode().
int IndexAdapter::StackSize = 64;
int IndexAdapter::InitSize = 4096;
int IndexAdapter::CurrenMaxLinkCode = 0;
/**
 * Creates an adapter with empty stacks of StackSize entries.
 *
 * All scalar and pointer members now get a defined value here, the same one
 * init() assigns where init() covers the member. Previously _sp, _tsp,
 * _attrSP, _currentNode, _availContextNumber, _lastWordNumber, _firstWord
 * and _index were indeterminate until init() ran or the member was assigned
 * from outside, so for example characters() could read an uninitialized _sp.
 */
IndexAdapter::IndexAdapter()
    : _indexOnOffStack(StackSize), _sp(-1), _tsp(-1), _attributeStack(StackSize),
      _currentNode(0), _attrSP(-1), _availContextNumber(0),
      _lastWordNumber(0), _firstWord(-1),
      _anyLocationsStored(false), _index(0), _size(InitSize)
{
    _indexText_Name = "text";
    _indexElement_Name = "element";
    _indexAttribute_Name = "attribute";
    _nodeID_Name = "nodeID";
    _tokenizer_Name = "tokenizer";
    _attributeName_Name = "attributeName";
}
void IndexAdapter::storeLocation(const std::string &token, int number)
{
int concept = intern(token);
HCDBG(std::cerr << "storeLocation of number " << number << "for token "
<< token << " as conceptlocation " << concept << std::endl);
_locations.push_back(ConceptLocation(concept, number, number));
}
/** Appends one edge: its relation to _links, its sequence number to
    _seqNumbers and its destination to _dests. */
void IndexAdapter::storeEdge(int relation, int seqNumber, int destination)
{
    _dests.push_back(destination);
    _seqNumbers.push_back(seqNumber);
    _links.push_back(relation);
    HCDBG(std::cerr << "storeEdge" << std::endl);
}
/**
 * Builds the context tables (_dests, _seqNumbers, _links) of the document
 * from the text nodes collected in _textNodes.
 *
 * Text nodes get the context numbers 0 .. nTextNodes-1 (their position in
 * _textNodes); element contexts are numbered from nTextNodes upwards in the
 * order in which they are first met. _textNodes is emptied on the way.
 */
void IndexAdapter::finish()
{
_numberedNodes.clear();
_dests.clear();
_seqNumbers.clear();
_links.clear();
int nTextNodes = _textNodes.size();
_availContextNumber = nTextNodes;
// vector to hold parents of text nodes
Vector parents;
/*****
for each of the text nodes its sequence number is stored
as well as the index of its parent (in _dests)
_link is not stored as it is always "text()"
_availContextNumber only used to number parent element contexts
******/
for (int i = 0; i < nTextNodes; i++)
{
xmlNodePtr node = _textNodes[i];
xmlNodePtr parent = node->parent;
// find this text node's seq number
// (1 + number of text siblings that precede it)
int counter = 1;
xmlNodePtr sibling = parent->xmlChildrenNode;
while (sibling && sibling != node)
{
if (xmlNodeIsText(sibling))
++counter;
sibling = sibling->next;
}
_seqNumbers.push_back(counter);
// check whether parent already encountered
NodeHashtable::const_iterator number = _numberedNodes.find(parent);
if (number == _numberedNodes.end()) // not yet seen
{
int newContext = _availContextNumber++;
_numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext;
_dests.push_back(newContext);
// enqueue parent: its parent will need a number too
parents.push_back(parent);
// System.out.println(parent.getName().toString() +
// " -> " + newContext);
}
else
{
_dests.push_back(number->second);
}
} // end for
_textNodes.clear();
// store info about element ancestry of the above text nodes
// grandparents are added to the end of the vector
// (the loop below therefore also visits the entries it appends itself)
int rootElementPos = 0;
for (size_t i = 0; i < parents.size(); i++)
{
xmlNodePtr node = parents[i];
std::string name((const char*)(node->name));
xmlNodePtr parent = node->parent;
_links.push_back(getLinkCode(name));
// if (parent.getType() == Node.ELEMENT) // not ROOT
if (parent && parent->parent) // not ROOT
{
// find sequence number
// (1 + number of preceding siblings with the same name)
xmlNodePtr sibling = parent->xmlChildrenNode;
int counter = 1;
while (sibling && sibling != node)
{
if (strcmp((const char*)sibling->name, (const char*)name.c_str()) == 0)
++counter;
sibling = sibling->next;
}
_seqNumbers.push_back(counter);
// check whether parent already known
NodeHashtable::iterator number = _numberedNodes.find(parent);
if (number == _numberedNodes.end())
{
int newContext = _availContextNumber++;
_numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext;
_dests.push_back(newContext);
// enqueue parent: its parent will need a number too
parents.push_back(parent);
//System.out.println(parent.getName().toString() +
// " -> " + newContext);
}
else
{
_dests.push_back(number->second);
}
}
else
{
// root element: its destination is patched after the loop
_dests.push_back(0); // placeholder
_seqNumbers.push_back(1);
rootElementPos = i + nTextNodes;
// System.out.println("rootElementPos = " + i);
}
} // end for
// nothing was collected at all: provide the single slot that is patched below
if (_dests.empty())
_dests.push_back(0);
// index to sentinel
_dests[rootElementPos] = _availContextNumber;
} // end public void finish
/**
 * Resets the per-document state: empties the stacks (by resetting their
 * stack pointers), restarts word numbering at 0 and clears the collected
 * initial words and locations.
 */
void IndexAdapter::init()
{
    // all the contexts' tables
    _locations.clear();
    _initialWords.clear();

    _availContextNumber = 0;
    _anyLocationsStored = false;
    _lastWordNumber = 0;

    _attrSP = -1;
    _tsp = -1;
    _sp = -1;
}
/**
 * Handles one attribute of the element being processed.
 *
 * - "nodeID": the value is parsed as a decimal number and taken as the
 *   address of the current node.
 * - "tokenizer": only a diagnostic is printed when a tokenizer other than
 *   the simple one is requested (outside extension mode).
 * - "attributeName": an entry is stored on the attribute stack and a "+<"
 *   location is recorded for it.
 * Other attributes are ignored.
 *
 * @param name  attribute name, must not be NULL
 * @param value attribute value; NULL is treated as an empty string. The
 *              caller gets the value from xmlNodeListGetString(), which
 *              yields NULL for an attribute without content, and the old
 *              code passed that NULL on to strtol()/strcmp()/std::string.
 */
void IndexAdapter::attribute(const char *name, const char *value)
{
    if (!value)
        value = "";
    HCDBG(std::cerr << "attribute: " << name << " = " << value << std::endl);
    if (strcmp(name, _nodeID_Name) == 0)
        _currentNode = (xmlNodePtr)(strtol(value, NULL, 10));
    else if (strcmp(name, _tokenizer_Name) == 0)
    {
        if (strcmp(value, "com.sun.xmlsearch.util.SimpleTokenizer") != 0 && !isExtensionMode() )
            std::cerr << "changing tokenizers not implemented in C++ version of HelpLinker"
                << " because no other tokenizers were referenced in the helpcontent2 source"
                << std::endl;
    }
    else if (strcmp(name, _attributeName_Name) == 0)
    {
        //namespace prefix ?
        std::string attrVal = std::string("index:") + value;
        if( !isExtensionMode() )
            std::cout << "attrVal = " << attrVal << std::endl;
        // "attributeName" outside of any "attribute" element: there is no
        // stack slot to write to
        if (_attrSP < 0)
            return;
        // nesting deeper than the current stack: grow instead of writing past the end
        if (static_cast<size_t>(_attrSP) >= _attributeStack.size())
            _attributeStack.resize(_attrSP + 1);
        _attributeStack[_attrSP] = std::string(name) + '<' + value + '<' + attrVal;
        storeLocation("+<" + _attributeStack[_attrSP]);
    }
}
/**
 * Tokenizes text and records a location for every token that is not in the
 * stop list. Stop-listed tokens still consume a word number. When at least
 * one location was stored, the first word number of this text and the
 * current node are appended to _initialWords and _textNodes.
 *
 * With EMULATEORIGINAL defined, tokens consisting of a single digit are
 * skipped without consuming a word number.
 */
void IndexAdapter::indexText(const xmlChar *text)
{
    static Tokenizer tokenizer;
    tokenizer.setText(text);
    _firstWord = _lastWordNumber;
    _anyLocationsStored = false;

    for (std::string word = tokenizer.nextToken(); !word.empty(); word = tokenizer.nextToken())
    {
        HCDBG(std::cerr << "token is: " << word << std::endl);
#ifdef EMULATEORIGINAL
        if ((word.size() == 1) && isdigit(word[0]))
            continue;
#endif
        const bool isStopWord =
            std::find(_stoplist.begin(), _stoplist.end(), word) != _stoplist.end();
        if (isStopWord)
        {
            _lastWordNumber++;
        }
        else
        {
            storeLocation(word);
            _anyLocationsStored = true;
        }
    }

    const bool recordTextNode = _anyLocationsStored && _firstWord > -1;
    if (recordTextNode)
    {
        _initialWords.push_back(_firstWord);
        HCDBG(std::cerr << "appending " << _firstWord << std::endl);
        _textNodes.push_back(_currentNode);
    }
    // reset before next batch
    _firstWord = -1;
}
/**
 * Handles the character data of a text node: it is indexed when the
 * innermost open "element" element has switched indexing on.
 *
 * @param str the text; must not be NULL
 * @throws HelpProcessingException (HELPPROCESSING_INTERNAL_ERROR) when str is NULL
 */
void IndexAdapter::characters(const xmlChar *str) throw( HelpProcessingException )
{
    if (str == 0)
    {
        std::stringstream aStrStream;
        aStrStream << "no characters!" << std::endl;
        throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
    }
    HCDBG(std::cerr << "IndexAdapter::characters of " << str << std::endl);
    HCDBG(std::cerr << _sp << " : " << _indexOnOffStack[_sp] << std::endl);

    const bool indexingEnabled = _sp >= 0 && _indexOnOffStack[_sp];
    if (indexingEnabled)
        indexText( str );
}
/**
 * Reacts to the start of a node according to its name: an "element" node
 * pushes "indexing on" and pops the tokenizer stack, an "attribute" node
 * opens a new attribute stack level; "text" and all other names need nothing.
 */
void IndexAdapter::startElement(xmlNodePtr node)
{
    const char *name = (const char*)(node->name);
    HCDBG(std::cerr << "startElement is " << name << std::endl);

    if (strcmp(name, _indexElement_Name) == 0)
    {
        ++_sp;
        _indexOnOffStack[_sp] = true;
        // pop Tokenizer stack
        // following attribute can push selected Tokenizer
        if (_tsp != -1)
            _tsp--;
        return;
    }
    if (strcmp(name, _indexText_Name) == 0)
        return;
    if (strcmp(name, _indexAttribute_Name) == 0)
        _attrSP++;
}
/**
 * Reacts to the end of a node according to its name: an "element" node pops
 * the indexing stack, an "attribute" node pops the attribute stack and
 * records a "-<" location for the popped entry; "text" and all other names
 * need nothing.
 */
void IndexAdapter::endElement(xmlNodePtr node)
{
    const char *name = (const char*)(node->name);
    HCDBG(std::cerr << "endElement is " << name << std::endl);

    if (strcmp(name, _indexElement_Name) == 0)
    {
        _sp--;
        return;
    }
    if (strcmp(name, _indexText_Name) == 0)
    {
        // reset
        return;
    }
    if (strcmp(name, _indexAttribute_Name) == 0)
    {
        const std::string closing = "-<" + _attributeStack[_attrSP];
        _attrSP--;
        storeLocation(closing);
    }
}
int IndexAdapter::getLinkCode(const std::string &linkName)
{
LinkHashTable::iterator code = _linkCodes.find(linkName);
if (code != _linkCodes.end())
return code->second;
else
{
_linknames.push_back(linkName);
int newCode = CurrenMaxLinkCode++;
_linkCodes.insert(LinkHashTable::value_type(linkName, newCode)).first->second = newCode;
return newCode;
}
}
/**
 * Processes node and, recursively, all of its children: startElement(),
 * then every attribute, then the character data if node is a text node,
 * then the children, and finally endElement().
 *
 * @param node the node to process
 * @param doc  the document node belongs to
 */
void IndexAdapter::process(xmlNodePtr node, xmlDocPtr doc)
{
    startElement(node);

    xmlAttrPtr attr = node->properties;
    while (attr)
    {
        xmlChar *attrValue = xmlNodeListGetString(doc, attr->children, 0);
        attribute((const char*)(attr->name), (const char*)attrValue);
        xmlFree(attrValue);
        attr = attr->next;
    }

    if (xmlNodeIsText(node))
    {
        xmlChar *content = xmlNodeListGetString(doc, node, 1);
        characters(content);
        xmlFree(content);
    }

    xmlNodePtr child = node->xmlChildrenNode;
    while (child)
    {
        process(child, doc);
        child = child->next;
    }

    endElement(node);
}
/**
 * Drives the indexing of documents: owns the IndexAdapter that collects the
 * data of the document currently open and the stylesheet used to transform
 * documents for indexing.
 */
class XmlIndexBuilder
{
private:
    fs::path _transformLocation;
    xsltStylesheetPtr _indexingTransform;
    IndexAdapter _indexAdapter;
    // concept id of the document currently open, 0 when none is open
    int _currentDocID;
    void reset();
    xsltStylesheetPtr getTransform(const std::string &stylesheetName);
public:
    /** Creates a builder without a stylesheet and with no document open.
        _currentDocID is set to 0 here because openDocument() tests it;
        it used to be left uninitialized by this constructor. */
    XmlIndexBuilder() : _indexingTransform(0), _currentDocID(0) {}
    XmlIndexBuilder(const fs::path &dir);
    ~XmlIndexBuilder();
    void clearIndex();
    void setTransformLocation(const fs::path &filelocation);
    void init(const std::string &transform);
    void initXmlProcessor(const std::string &transform);
    void indexDocument(xmlDocPtr document, const std::string &docURL, const std::string &title);
    int intern(const std::string &name);
    void openDocument(const std::string &name) throw( HelpProcessingException );
    void closeDocument(const std::string &name) throw( HelpProcessingException );
    void close();
};
/**
 * Writes the collected link names to the index's "LINKNAMES" file (the file
 * is truncated first), closes the index and prints "done".
 *
 * With EMULATEORIGINAL defined the file starts with a fixed 40 byte header,
 * followed by the number of names and then, per name, the character 't', the
 * name's length (writeShort) and the name itself. Otherwise the names are
 * written as plain text, one per line.
 */
void XmlIndexBuilder::close()
{
fs::path fullname = _indexAdapter._index->indexFile("LINKNAMES");
std::fstream _linkFile(fullname.native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary);
#ifdef EMULATEORIGINAL
// fixed header bytes written in front of the names
// NOTE(review): looks like a Java serialization header for a String[] - confirm
static const unsigned char vectorheader[] =
{
0xAC, 0xED, 0x00, 0x05, 0x75, 0x72, 0x00, 0x13,
0x5B, 0x4C, 0x6A, 0x61, 0x76, 0x61, 0x2E, 0x6C,
0x61, 0x6E, 0x67, 0x2E, 0x53, 0x74, 0x72, 0x69,
0x6E, 0x67, 0x3B, 0xAD, 0xD2, 0x56, 0xE7, 0xE9,
0x1D, 0x7B, 0x47, 0x02, 0x00, 0x00, 0x78, 0x70
};
_linkFile.write((const char*)(&vectorheader[0]), sizeof(vectorheader));
writeInt(_linkFile, _indexAdapter._linknames.size());
std::vector<std::string>::iterator aEnd = _indexAdapter._linknames.end();
for (std::vector<std::string>::iterator aIter = _indexAdapter._linknames.begin();
aIter != aEnd; ++aIter)
{
HCDBG(std::cerr << "linkname is " << *aIter << std::endl);
_linkFile << 't';
writeShort(_linkFile, aIter->size());
_linkFile << *aIter;
}
#else
std::vector<std::string>::iterator aEnd = _indexAdapter._linknames.end();
for (std::vector<std::string>::iterator aIter = _indexAdapter._linknames.begin();
aIter != aEnd; ++aIter)
{
_linkFile << *aIter << '\n';
}
#endif
#if 0
// output link codes
/*
Enumeration keys = _linknames.elements();
while (keys.hasMoreElements())
System.out.println((String)keys.nextElement());
*/
#endif
_indexAdapter._index->close();
std::cout << "done" << std::endl;
}
// Forwards `name` to the index adapter's intern() and returns the integer it
// yields. openDocument() stores that value as the current document id and
// closeDocument() passes it as the title id when compressing.
int XmlIndexBuilder::intern(const std::string &name)
{
return _indexAdapter.intern(name);
}
/**
 * Marks a document as open for indexing.
 *
 * @param name document name; its prefix is translated and the result interned
 *             to obtain the document id.
 * @throws HelpProcessingException (HELPPROCESSING_INTERNAL_ERROR) if a
 *         document is already open, i.e. _currentDocID is non-zero.
 */
void XmlIndexBuilder::openDocument(const std::string &name) throw( HelpProcessingException )
{
    if (_currentDocID == 0)
    {
        _currentDocID = intern( PrefixTranslator::translatePrefix(name) );
        reset(); // start with clean context gathering state
        return;
    }

    std::stringstream message;
    message << "document already open" << std::endl;
    throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, message.str() );
}
// Out-of-class definitions of BitBuffer's static integer members.
// NOTE(review): BitBuffer itself is declared outside this part of the file;
// the names suggest an initial capacity and word/byte geometry - confirm
// against the class before changing any value.
int BitBuffer::InitSize = 256;
int BitBuffer::NBits = 32;
int BitBuffer::BitsInByte = 8;
int BitBuffer::BytesInInt = 4;
// Encodes the values of `pos` into _buffer using parameter `k`.
//
// Every value is split into a high part (value >> k) and a low part
// (value % (1 << k)). While the high part equals the one of the previous
// value (initially 0) only a marker bit plus the k low bits are appended.
// When the high part changes, a variable-length prefix describing the new
// high part (lev zero bits, a one bit, lev bits of n2 - min) is appended,
// followed by the k low bits. A 3-bit end marker is appended last and the
// buffer is closed.
//
// pos: values to encode
// k:   number of low bits stored verbatim per value
void Compressor::encode(const IntegerArray &pos, int k)
{
HCDBG(std::cerr << "1:start this encode of " << k << "size of "
<< pos.size() << std::endl);
// n1: high part of the previously encoded value
unsigned int n1 = 0;
// power == 2^k; used both as modulus and as the marker bit above the low bits
unsigned int power = 1 << k;
for (size_t i = 0; i < pos.size(); i++)
{
HCDBG(std::cerr << "1: loop " << i << std::endl);
// n2: high part, rem: low k bits of the current value
unsigned int n2 = pos[i] >> k;
int rem = pos[i] % power;
HCDBG(std::cerr << "1: n1, n2 : " << n1 << "," << n2 << std::endl);
if (n2 != n1)
{
// Walk the bits of n1 (via a) from least significant upwards until the
// interval starting at min reaches n2; lev counts the levels walked and
// power2 is the weight of the bit currently examined.
unsigned int min = n1;
unsigned int a = n1;
int lev = 0, power2 = 1;
if (n2 > n1)
for (size_t max = n1; max < n2; a >>= 1, power2 <<= 1, lev++)
if ((a & 1) != 0)
min -= power2;
else
max += power2;
else
for ( ; min > n2; a >>= 1, power2 <<= 1, lev++)
if ((a & 1) != 0)
min -= power2;
// lev 0s, 1, lev bits of (n2 - min) plus following value
// no 'V' symbol needed here
// Emit prefix and low bits in one append when they fit into NBits bits,
// otherwise split into several appends.
if (lev*2 + 1 + k <= NBits)
_buffer.append((1<<lev | (n2 - min)) << k | rem, lev*2+1+k);
else
{
if (lev*2 + 1 <= NBits)
_buffer.append(1 << lev | (n2 - min), lev*2 + 1);
else
{
_buffer.append(0, lev);
_buffer.append(1 << lev | (n2 - min), lev + 1);
}
_buffer.append(rem, k);
}
n1 = n2;
}
else
_buffer.append(rem | power, k + 1); // 'V' + value
}
// '&' binds tighter than '|', so this appends 2 | (n1 & 1) in 3 bits
_buffer.append(2 | n1 & 1, 3); // marking end
_buffer.close();
HCDBG(std::cerr << "1:end this encode of " << k << std::endl);
}
// Encodes pairs (pos[i], len[i]) into _buffer.
//
// pos[i] is encoded exactly like in encode(pos, k): high part pos[i] >> k,
// low part pos[i] % (1 << k), with a variable-length prefix whenever the high
// part differs from the previous one. In addition len[i] is appended in k2
// bits after every position. A 3-bit end marker is appended last and the
// buffer is closed.
//
// pos: first component of each pair, encoded with parameter k
// len: second component, indexed in parallel with pos, stored in k2 bits each
// k:   number of low bits of pos[i] stored verbatim
// k2:  number of bits used for each len[i]
void Compressor::encode(const IntegerArray &pos, const IntegerArray &len, int k, int k2)
{
HCDBG(std::cerr << "2:start this encode of " << k << "size of "
<< pos.size() << std::endl);
// power == 2^k; n1 is the high part of the previously encoded position
int power = 1 << k, n1 = 0;
for (size_t i = 0; i < pos.size(); i++)
{
HCDBG(std::cerr << "2: loop " << i << std::endl);
// n2: high part, rem: low k bits of the current position
int n2 = pos[i] >> k;
int rem = pos[i] % power;
HCDBG(std::cerr << "2: n1, n2 : " << n1 << "," << n2 << std::endl);
if (n2 != n1)
{
// Walk the bits of n1 (via a) from least significant upwards until the
// interval starting at min reaches n2; lev counts the levels walked.
int min = n1, a = n1;
int lev = 0, power2 = 1;
if (n2 > n1)
for (int max = n1; max < n2; a >>= 1, power2 <<= 1, lev++)
if ((a & 1) != 0)
min -= power2;
else
max += power2;
else
for ( ; min > n2; a >>= 1, power2 <<= 1, lev++)
if ((a & 1) != 0)
min -= power2;
// lev 0s, 1, lev bits of (n2 - min) plus following value
// Emit prefix and low bits in one append when they fit into NBits bits,
// otherwise split into several appends.
if (lev*2 + 1 + k <= NBits)
_buffer.append((1<<lev | (n2 - min)) << k | rem, lev*2+1+k);
else
{
if (lev*2 + 1 <= NBits)
_buffer.append(1 << lev | (n2 - min), lev*2 + 1);
else
{
_buffer.append(0, lev);
_buffer.append(1 << lev | (n2 - min), lev + 1);
}
_buffer.append(rem, k);
}
// the second component always follows the position
_buffer.append(len[i], k2);
n1 = n2;
}
else
_buffer.append((rem|power)<<k2 | len[i], k+k2+1); // 'V' + v1,v2
}
// '&' binds tighter than '|', so this appends 2 | (n1 & 1) in 3 bits
_buffer.append(2 | n1 & 1, 3); // marking end
_buffer.close();
HCDBG(std::cerr << "2:end this encode of " << k << std::endl);
}
// k: starting value for minimization
int Compressor::minimize(const IntegerArray &array, int startK)
{
BitBuffer saved;
int minK = startK;
_buffer.clear();
encode(array, startK);
int min = _buffer.bitCount(); // init w/ first value
saved.setFrom(_buffer);
_buffer.clear();
encode(array, startK + 1);
if (_buffer.bitCount() < min)
{
int k = startK + 1;
do
{
saved.setFrom(_buffer);
min = _buffer.bitCount();
minK = k;
_buffer.clear();
encode(array, ++k);
}
while (_buffer.bitCount() < min);
}
else // try smaller values through 1
{
for (int k = startK - 1; k > 0; k--)
{
_buffer.clear();
encode(array, k);
if (_buffer.bitCount() < min)
{
saved.setFrom(_buffer);
min = _buffer.bitCount();
minK = k;
}
else
break;
}
}
_buffer.setFrom(saved);
return minK;
}
// Compresses `array` by first converting it with toDifferences() into an
// equally sized array and then running minimize() on that array, starting
// the search at BeginK. Returns the k chosen by minimize().
// NOTE(review): the name implies `array` is sorted ascending; toDifferences()
// is defined elsewhere - confirm it produces successive differences.
int Compressor::compressAscending(const IntegerArray &array)
{
IntegerArray differences(array.size());
toDifferences(array, differences);
return minimize(differences, BeginK);
}
// NBits: upper bound on the bit width that encode() passes to a single
// _buffer.append() call before it splits the output into several appends.
int Compressor::NBits = 32;
// BeginK: starting k handed to minimize() by compressAscending().
int Compressor::BeginK = 5;
/**
 * Encodes the concept locations (and optional extents) of one document into a
 * "micro index" and writes it to a stream.
 *
 * The locations are sorted by concept and cut into groups of at most
 * NConceptsInGroup distinct concepts. Each group gets its own Compressor
 * holding the concept differences followed by the position/label pairs. The
 * k values, group lengths and per-group maximal concepts are collected in
 * separate tables which finalizeEncoding() compresses as a header.
 */
class DocumentCompressor
{
public:
    static int NConceptsInGroup;
    static int BitsInLabel;
    static int DefaultSize;
private:
    int _nGroups;
    int _nExtents;
    unsigned int _freeComp;           // number of entries of _compressors in use
    int _kk;
    Compressor *_currentCompressor;   // points into _compressors; refreshed by nextCompressor()
    std::vector<Compressor> _compressors;
    Compressor _kCompr;
    Compressor _lCompr;
    Compressor _mCompr;
    Compressor _posCompressor;
    IntegerArray _kTable; // k's for the series
    IntegerArray _lTable; // lengths of the C/P groups
    IntegerArray _maxConcepts; // maximal concepts in CP
    IntegerArray _concepts;
    IntegerArray _documents;
    IntegerArray _microIndexOffsets;
    IntegerArray _titles;
    // _contextsOffsets for use in XML indexing
    IntegerArray _contextsOffsets;
    IntegerArray _positions;
    IntegerArray _labels;
public:
    // All counters start at zero so that the object is in a defined state
    // even before the first encode() call resets them.
    DocumentCompressor()
        : _nGroups(0), _nExtents(0), _freeComp(0), _kk(0),
          _currentCompressor(0), _compressors(DefaultSize) {}

    /**
     * Encodes `locations` (and `extents`, if any) and writes the result to
     * `output`. `locations` is sorted and relabelled in place and must not be
     * empty (encode() reads locations[0]).
     */
    void writeOutMicroIndex(std::fstream &output,
        std::vector<ConceptLocation> &locations,
        std::vector<ConceptLocation> &extents)
    {
        HCDBG(std::cerr << "writeOutMicroIndex start" << std::endl);
        encode(locations, NConceptsInGroup);
        HCDBG(std::cerr << "writeOutMicroIndex end encode" << std::endl);
        if (!extents.empty())
            encodeExtents(extents);
        HCDBG(std::cerr << "writeOutMicroIndex finalize" << std::endl);
        finalizeEncoding();
        HCDBG(std::cerr << "writeOutMicroIndex write" << std::endl);
        writeOut(output);
        HCDBG(std::cerr << "writeOutMicroIndex end" << std::endl);
    }
private:
    // Resets all encoding state, then splits `locations` into groups of
    // `nConcepts` distinct concepts and encodes each group. Within a group
    // every location's concept id is replaced by its index in the group.
    void encode(std::vector<ConceptLocation> &locations, int nConcepts)
    {
        int initK = 4;
        // first sort by concept only
#ifdef CMCDEBUG
        for (size_t i = 0; i < locations.size(); ++i)
            fprintf(stderr, "unsorted is %d\n", locations[i].getConcept());
#endif
        HCDBG(std::cerr << "start sort" << std::endl);
        ConceptLocation::sortByConcept(locations, 0, locations.size());
        HCDBG(std::cerr << "end sort" << std::endl);
#ifdef CMCDEBUG
        for (size_t i = 0; i < locations.size(); ++i)
            fprintf(stderr, "sorted is %d\n", locations[i].getConcept());
#endif
        // using the fact that concepts are already sorted
        // count of groups of 'nConcepts'
        // go for differences directly
        // clear the state
        _nGroups = 0;
        _nExtents = 0;
        _kTable.clear();
        _lTable.clear();
        _concepts.clear();
        _maxConcepts.clear();
        _kCompr.clear();
        _lCompr.clear();
        _mCompr.clear();
        for (size_t i = 0; i < _compressors.size(); i++)
            _compressors[i].clear();
        _freeComp = 0;
        _currentCompressor = NULL;
        // end of resetting state
        int conceptCounter = 0;
        int fromIndex = 0;
        int prevMax = 0;
        int last = locations[0].getConcept(); // init w/ first ID
        nextCompressor();
        _concepts.push_back(last);
        for (size_t i = 0;;)
        {
            // relabel the run of equal concepts with the in-group counter
            for (; i < locations.size() && locations[i].getConcept() == last; i++)
                locations[i].setConcept(conceptCounter);
            if (i == locations.size())
            {
                if (!_concepts.empty())
                {
                    ++_nGroups;
                    _kTable.push_back(_currentCompressor->minimize(_concepts, initK));
                }
                encodePositions(locations, fromIndex, i, BitsInLabel);
                break;
            }
            else
            { // new concept (group?)
                if (++conceptCounter == nConcepts)
                {
                    ++_nGroups;
                    // we are looking at the beginning of a new group
                    // last is maximal for the group just finished
                    // it won't be stored in concepts array but maxConcepts
                    _concepts.pop_back();
                    HCDBG(fprintf(stderr, "_maxConcepts %d %d -> %d\n", last, prevMax, last - prevMax));
                    _maxConcepts.push_back(last - prevMax);
                    prevMax = last;
                    _kTable.push_back(_currentCompressor->minimize(_concepts, initK));
#ifdef CMCDEBUG
                    for(size_t p = 0; p < locations.size(); ++p)
                        std::cerr << "microindex2 this testing is " << locations[p].getBegin() <<
                            locations[p].getEnd() << " : " << locations[p].getConcept() << std::endl;
#endif
                    HCDBG(std::cerr << "two encodePositions " << fromIndex << " " << i << std::endl);
                    encodePositions(locations, fromIndex, i, BitsInLabel);
                    fromIndex = i;
                    nextCompressor();
                    _concepts.clear();
                    conceptCounter = 0;
                }
                _concepts.push_back(locations[i].getConcept() - last);
                last = locations[i].getConcept();
            }
        }
    }

    // Encodes positions and labels of locations[from, to) and appends the
    // result to the current compressor. `cK` is the bit width of a label.
    void encodePositions(std::vector<ConceptLocation> &locations, int from, int to, int cK)
    {
        int initK = 3;
        int lastPos, k;
        // sort in place by positions only
#ifdef CMCDEBUG
        for (int i = from; i < to; ++i)
            fprintf(stderr, "unsorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd());
#endif
        ConceptLocation::sortByPosition(locations, from, to);
#ifdef CMCDEBUG
        for (int i = from; i < to; ++i)
            fprintf(stderr, "sorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd());
#endif
        _positions.clear();
        _labels.clear();
        _positions.push_back(lastPos = locations[from].getBegin());
        _labels.push_back(locations[from].getConcept()); // now: a label
        // skip duplicates
        for (int i = from, j = from + 1; j < to; j++)
        {
            if (locations[i].equals(locations[j]) == false)
            {
                i = j;
                HCDBG(std::cerr << "i is " << i << "locations begin is "
                    << locations[i].getBegin() << "last pos is " << lastPos << std::endl);
                _positions.push_back(locations[i].getBegin() - lastPos);
                lastPos = locations[i].getBegin();
                _labels.push_back(locations[i].getConcept()); // now: a label
            }
        }
        // first find k by minimizing just positions w/o labels
        _kTable.push_back(k = _posCompressor.minimize(_positions, initK));
        _posCompressor.clear();
        HCDBG(std::cerr << "start encodePositions" << std::endl);
        _posCompressor.encode(_positions, _labels, k, cK);
        HCDBG(std::cerr << "end encodePositions" << std::endl);
        _currentCompressor->concatenate(_posCompressor);
    }

    void encodeExtents(std::vector<ConceptLocation> &extents)
    {
        // side effects:
        // 'k3' added to _kTable
        // a number of compressors populated: header + lengths' lists
        int initK = 4;
        int c = 0;
        IntegerArray concepts; //difference
        IntegerArray lengths;
        IntegerArray kTable;
        IntegerArray lTable;
        // reserve a compressor for concatenated tables
        nextCompressor();
        // Remember the header by index, not by pointer: the nextCompressor()
        // calls below may push_back into _compressors and reallocate it,
        // which would leave a stored pointer dangling.
        const size_t extentsHeaderIndex = _freeComp - 1;
        std::vector<ConceptLocation>::const_iterator aEnd = extents.end();
        for (std::vector<ConceptLocation>::const_iterator aIter = extents.begin();
            aIter != aEnd; ++aIter)
        {
            if (aIter->getConcept() != c)
            {
                if (c != 0)
                {
                    _nExtents++;
                    nextCompressor();
                    kTable.push_back(_currentCompressor->minimize(lengths, initK));
                    lTable.push_back(_currentCompressor->byteCount());
                }
                concepts.push_back(aIter->getConcept() - c);
                c = aIter->getConcept();
                lengths.clear();
                lengths.push_back(aIter->getLength());
            }
            else
                lengths.push_back(aIter->getLength());
        }
        // last table of lengths
        nextCompressor();
        kTable.push_back(_currentCompressor->minimize(lengths, initK));
        lTable.push_back(_currentCompressor->byteCount());
        Compressor compressor1;
        kTable.push_back(compressor1.minimize(lTable, initK));
        Compressor compressor2;
        kTable.push_back(compressor2.minimize(concepts, initK));
        // resolve the header only now that _compressors no longer grows
        Compressor &extentsHeader = _compressors[extentsHeaderIndex];
        _kTable.push_back(extentsHeader.minimize(kTable, initK)); // k3
        extentsHeader.concatenate(compressor1);
        extentsHeader.concatenate(compressor2);
    }

    // Compresses the header tables once all groups (and extents) are encoded.
    void finalizeEncoding()
    {
        if (_nGroups > 1)
        {
            // if extents follow C/P groups we need the length of the last group
            int limit = _nExtents > 0 ? _freeComp : _freeComp - 1;
            for (int j = 0; j < limit; j++) // length of last not saved
                _lTable.push_back(_compressors[j].byteCount());
            _kTable.push_back(_mCompr.minimize(_maxConcepts, 3));
            _kTable.push_back(_lCompr.minimize(_lTable, 3));
            _kk = _kCompr.minimize(_kTable, 3);
            _kCompr.concatenate(_lCompr);
            _kCompr.concatenate(_mCompr);
        }
        else if (_nGroups == 1 && _nExtents > 0)
        {
            // length of the single C/P group packed with k-s
            _kTable.push_back(_compressors[0].byteCount());
            _kk = _kCompr.minimize(_kTable, 3);
        }
    }

    // Writes the encoded micro index. The two top bits of the first byte tell
    // whether there are several groups (0x80) and/or extents (0x40).
    void writeOut(std::fstream &out)
    {
        if (_nExtents == 0)
        {
            if (_nGroups > 1)
            {
                unsigned char byte = static_cast<unsigned char>((0x80 | _kk));
                out.write( (const char*)&byte, 1 );
                HCDBG(std::cerr << "writeOut of " << int(byte) << std::endl);
                _kCompr.write(out); // concatenated k,l,m
                for (size_t j = 0; j < _freeComp; j++)
                    _compressors[j].write(out);
            }
            else // single group, no extents; code: 00
            {
                unsigned char k1 = (unsigned char)(_kTable[0]);
                unsigned char k2 = (unsigned char)(_kTable[1]);
                out.write( (const char*)&k1, 1 );
                out.write( (const char*)&k2, 1 );
                _compressors[0].write(out); // C/P
            }
        }
        else
        { // extents
            unsigned char byte = static_cast<unsigned char>(
                (_nGroups > 1 ? 0xC0 : 0x40) | _kk);
            out.write( (const char*)&byte, 1 );
            _kCompr.write(out);
            for (size_t j = 0; j < _freeComp; j++)
                _compressors[j].write(out);
        }
    }

    // Hands out the next unused compressor, growing the pool if needed, and
    // makes it the current one. Growing may reallocate _compressors, so
    // pointers obtained from earlier calls must not be kept across this call.
    Compressor* nextCompressor()
    {
        if (_freeComp == _compressors.size())
            _compressors.push_back(Compressor());
        return _currentCompressor = &_compressors[_freeComp++];
    }

    // Number of bytes writeOut() would produce for the current state.
    int byteCount()
    {
        if (_nGroups == 1 && _nExtents == 0)
            return 2 + _compressors[0].byteCount();
        else
        {
            int result = 1; // initial kk
            result += _kCompr.byteCount();
            for (size_t j = 0; j < _freeComp; j++)
                result += _compressors[j].byteCount();
            return result;
        }
    }
};
// NConceptsInGroup: number of distinct concepts per group; writeOutMicroIndex()
// passes it to encode() as the group size.
int DocumentCompressor::NConceptsInGroup = 16;
// BitsInLabel: bit width passed to encodePositions() for each label.
int DocumentCompressor::BitsInLabel = 4;
// DefaultSize: number of Compressor objects the constructor pre-allocates.
int DocumentCompressor::DefaultSize = 32;
// Returns the index's DocumentCompressor, allocating it on first use.
// The object is released by Index::~Index().
DocumentCompressor& Index::getDocumentCompressor()
{
if (!_documentCompressor)
_documentCompressor = new DocumentCompressor();
return *_documentCompressor;
}
void Index::compress(int docID, int titleID,
std::vector<ConceptLocation> &locations,
std::vector<ConceptLocation> &extents)
{
std::fstream &positions = getPositionsFile();
positions.seekg(0, std::ios::end);
long currentEnd = positions.tellg();
if (currentEnd < 0) currentEnd = 0;
positions.clear();
positions.seekg(currentEnd, std::ios::beg);
_documents.push_back(docID);
_microIndexOffsets.push_back(currentEnd);
HCDBG(std::cerr << "_microIndexOffsets pushed back " << currentEnd << std::endl);
HCDBG(std::cerr << "added title id of " << titleID << std::endl);
_titles.push_back(titleID);
getDocumentCompressor().writeOutMicroIndex(positions,
locations, extents);
}
void Index::writeOutOffsets()
{
Compressor documents;
int k1 = documents.minimize(_documents, 8);
Compressor offsets;
int k2 = offsets.compressAscending(_microIndexOffsets);
Compressor titles;
int k3 = titles.minimize(_titles, 8); // 8 is the starting k
std::fstream &out = getOffsetsFile();
out.seekp(0); // position at beginning
out.clear();
unsigned char byte;
byte = static_cast<unsigned char>(k1);
out.write( (const char*)&byte, 1 );
HCDBG(fprintf(stderr, "a: offset dump of %x\n", byte));
documents.write(out);
byte = static_cast<unsigned char>(k2);
out.write( (const char*)&byte, 1 );
HCDBG(fprintf(stderr, "b: offset dump of %x\n", byte));
offsets.write(out);
byte = static_cast<unsigned char>(k3);
out.write( (const char*)&byte, 1 );
HCDBG(fprintf(stderr, "c: offset dump of %x\n", byte));
titles.write(out);
}
// Releases every object the index holds through a raw pointer: schema,
// dictionary parameters, dictionary, the positions and offsets files and the
// lazily created document compressor (see getDocumentCompressor()).
Index::~Index()
{
delete _schema;
delete _dictParams;
delete _dict;
delete _positionsFile;
delete _offsetsFile;
delete _documentCompressor;
}
/**
 * Compresses one document like Index::compress() and additionally appends the
 * document's context tables to the contexts file.
 *
 * @param docID         id of the document
 * @param titleID       id of the document title
 * @param locations     concept locations of the document
 * @param extents       extents of the document (may be empty)
 * @param k             k value written as a single byte before the tables
 * @param contextTables compressed context tables of the document
 */
void XmlIndex::compress(int docID, int titleID,
    std::vector<ConceptLocation> &locations,
    std::vector<ConceptLocation> &extents,
    int k, const Compressor &contextTables)
{
    HCDBG(std::cerr << "start compress" << std::endl);
    HCDBG(std::cerr << "docID : " << docID << " titleID : " << titleID <<
        "locations size : " << locations.size() << "extents size : " << extents.size() << std::endl);
    Index::compress(docID, titleID, locations, extents);
    HCDBG(std::cerr << "end compress" << std::endl);

    std::fstream& ctxFile = getContextsFile();

    // Find the current end of the file; a failed tellp() counts as offset 0.
    ctxFile.seekp(0, std::ios::end);
    long endOffset = ctxFile.tellp();
    if (endOffset < 0)
        endOffset = 0;
    ctxFile.clear();
    ctxFile.seekp(endOffset);

    const unsigned char kByte = static_cast<unsigned char>(k);
    writeByte(ctxFile, kByte);
    contextTables.write(ctxFile);
    _contextsOffsets.push_back(endOffset);
}
void XmlIndexBuilder::closeDocument(const std::string &title) throw( HelpProcessingException )
{
if (_currentDocID == 0)
{
std::stringstream aStrStream;
aStrStream << "no document open" << std::endl;
throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
}
else if (!_indexAdapter._locations.empty())
{
IntegerArray kTable;
Compressor compressor1;
Compressor compressor2;
Compressor compressor3;
Compressor compressor4;
kTable.push_back(compressor1.compressAscending(_indexAdapter._initialWords));
kTable.push_back(compressor2.minimize(_indexAdapter._dests, 2));
kTable.push_back(compressor3.minimize(_indexAdapter._links, 2));
kTable.push_back(compressor4.minimize(_indexAdapter._seqNumbers, 2));
Compressor compressor0;
int k0 = compressor0.minimize(kTable, 4);
compressor0.concatenate(compressor1);
compressor0.concatenate(compressor2);
compressor0.concatenate(compressor3);
compressor0.concatenate(compressor4);
std::vector<ConceptLocation> dummy;
_indexAdapter._index->compress(_currentDocID, intern(title),
_indexAdapter._locations, dummy, k0, compressor0);
}
else
{
// System.out.println("no indexable content");
}
_indexAdapter._locations.clear();
_currentDocID = 0; // state: nothing open
}
/**
 * Indexes one help document.
 *
 * The document is opened under `docURL`, transformed with the indexing
 * stylesheet, and the root node of the result plus all its following
 * siblings are fed to the index adapter. The transformed document is freed
 * again and the document is closed under `title`.
 *
 * @param doc    parsed source document
 * @param docURL name under which the document is opened
 * @param title  title passed to closeDocument()
 */
void XmlIndexBuilder::indexDocument(xmlDocPtr doc, const std::string &docURL, const std::string &title)
{
    HCDBG(std::cerr << "Indexing " << docURL << std::endl);
    openDocument(docURL);

    xmlDocPtr transformed = xsltApplyStylesheet(_indexingTransform, doc, NULL);
    _indexAdapter.init();

    // Walk the root of the transformed document and its siblings.
    xmlNodePtr node = xmlDocGetRootElement(transformed);
    while (node)
    {
        _indexAdapter.process(node, transformed);
        node = node->next;
    }
    xmlFreeDoc(transformed);

    _indexAdapter.finish();
    closeDocument(title);
}
// Deletes the index object held by the adapter (allocated in the
// XmlIndexBuilder(const fs::path&) constructor).
// NOTE(review): _indexingTransform, obtained from xsltParseStylesheetFile()
// in getTransform(), is not released here - confirm whether it is freed
// elsewhere or intentionally left to process exit.
XmlIndexBuilder::~XmlIndexBuilder()
{
delete _indexAdapter._index;
}
// Sets the directory in which getTransform() looks for stylesheet files.
void XmlIndexBuilder::setTransformLocation(const fs::path &filelocation)
{
_transformLocation = filelocation;
}
// Parses the stylesheet "<_transformLocation>/<stylesheetName>.xsl" with
// xsltParseStylesheetFile() and returns its result unchanged.
// NOTE(review): the result is not checked here; callers presumably have to
// cope with a null stylesheet if the file is missing or invalid - confirm.
xsltStylesheetPtr XmlIndexBuilder::getTransform(const std::string &stylesheetName)
{
fs::path stylesheet = _transformLocation / (stylesheetName + ".xsl");
return xsltParseStylesheetFile((const xmlChar *)stylesheet.native_file_string().c_str());
}
// Loads the stylesheet named `transform` via getTransform() and stores it as
// the indexing transform used by indexDocument(). A previously stored
// stylesheet pointer is overwritten without being freed.
void XmlIndexBuilder::initXmlProcessor(const std::string &transform)
{
_indexingTransform = getTransform(transform);
}
/**
 * Prepares the builder for indexing: initializes the index, resets the
 * gathering state, registers every link name already known to the index with
 * the adapter and loads the indexing stylesheet.
 *
 * @param transform base name (without ".xsl") of the indexing stylesheet
 */
void XmlIndexBuilder::init(const std::string &transform)
{
    _indexAdapter._index->init();
#ifdef EMULATEORIGINAL
    //some kind of bug in the original AFAICS
    _indexAdapter._stoplist.push_back("andnull");
#endif
    reset();

    // initialize vector and hashtable
    const std::vector<std::string> &knownLinks = _indexAdapter._index->getLinkNames();
    const size_t knownCount = knownLinks.size();
    for (size_t idx = 0; idx < knownCount; ++idx)
        _indexAdapter.getLinkCode(knownLinks[idx]);

    initXmlProcessor(transform);
}
void XmlIndexBuilder::reset()
{
_indexAdapter._availContextNumber = 0;
_indexAdapter._lastWordNumber = 0;
_indexAdapter._locations.clear();
_indexAdapter._anyLocationsStored = false;
// all the contexts' tables
_indexAdapter._initialWords.clear();
_indexAdapter._dests.clear();
_indexAdapter._links.clear();
_indexAdapter._seqNumbers.clear();
}
// Creates a builder whose adapter owns a new XmlIndex for `indexDir`.
// No stylesheet is loaded and no document is open yet; the index is deleted
// again in the destructor.
// NOTE(review): the meaning of the `true` argument is defined by XmlIndex's
// constructor, which is outside this part of the file.
XmlIndexBuilder::XmlIndexBuilder(const fs::path &indexDir)
: _indexingTransform(0), _currentDocID(0)
{
HCDBG(std::cerr << "indexDir is " << indexDir.native_directory_string() << std::endl);
_indexAdapter._index = new XmlIndex(indexDir, true);
}
// Forwards to clear() of the underlying index; HelpLinker calls this to
// build the index from scratch.
void XmlIndexBuilder::clearIndex()
{
_indexAdapter._index->clear();
}
/**
 * Links compiled help files into the help databases, the full-text index and
 * (outside extension mode) the output jar.
 *
 * The configuration (source root, stylesheets, module, language, ...) is
 * kept in static members; main() is the only public entry point.
 */
class HelpLinker
{
public:
    static void main(std::vector<std::string> &args, std::string* pExtensionPath = NULL )
        throw( HelpProcessingException );
    static bool isExtensionMode( void )
        {return bExtensionMode; }
private:
    HelpLinker() : init(true), xmlIndexBuilder(NULL) {}
    ~HelpLinker() { delete xmlIndexBuilder; }
    JarOutputStream jarOutputStream;
    static int locCount, totCount;
    static Stringtable additionalFiles;
    static HashSet helpFiles;
    static fs::path sourceRoot;
    static fs::path embeddStylesheet;
    static fs::path indexStylesheet;
    static fs::path outputFile;
    static std::string module;
    static std::string lang;
    static std::string hid;
    static std::string extensionPath;
    static bool bExtensionMode;
    fs::path indexDirName;
    Stringtable hidlistTranslation;
    fs::path indexDirParentName;
    bool init;                          // true until the index builder has been initialized in link()
    XmlIndexBuilder* xmlIndexBuilder;   // owned; NULL until initXMLIndexBuilder() ran
    void initXMLIndexBuilder();
    void createFileFromBytes(const std::string &fileName,
        const std::string &defaultXSL);
    // Closes the index builder if one exists. link() only creates the
    // builder under certain conditions, so the pointer may still be NULL
    // here and must not be dereferenced blindly.
    void closeXMLIndexBuilder()
    {
        if (xmlIndexBuilder)
            xmlIndexBuilder->close();
    }
    void link() throw( HelpProcessingException );
    // Stores one bookmark record (file[#anchor], jar file, title) under the
    // translated and URL-encoded help id in dbBase.
    void addBookmark( DB* dbBase, std::string thishid,
        const std::string& fileB, const std::string& anchorB,
        const std::string& jarfileB, const std::string& titleB );
#if 0
    /**
     * @param outputFile
     * @param module
     * @param lang
     * @param hid
     * @param helpFiles
     * @param additionalFiles
     */
    private HelpURLStreamHandlerFactory urlHandler = null;
#endif
};
// Free-function access to HelpLinker::isExtensionMode(), i.e. to the
// linker's static extension-mode flag.
bool isExtensionMode( void )
{
return HelpLinker::isExtensionMode();
}
namespace URLEncoder
{
    /**
     * Percent-encodes a string.
     *
     * Alphanumeric characters and the characters in "!$&'()*+,-.=@_" are
     * copied unchanged; every other byte, including an embedded NUL, is
     * written as '%' followed by two upper-case hex digits.
     *
     * @param rIn string to encode
     * @return the encoded string
     */
    static std::string encode(const std::string &rIn)
    {
        const char *good = "!$&'()*+,-.=@_";
        static const char hex[17] = "0123456789ABCDEF";
        std::string result;
        result.reserve(rIn.length());
        for (size_t i = 0; i < rIn.length(); ++i)
        {
            const unsigned char c = rIn[i];
            // strchr() also matches the terminating NUL of `good`, so a NUL
            // byte has to be excluded explicitly or it would be copied raw.
            const bool isSafe = isalnum(c) || (c != '\0' && strchr(good, c));
            if (isSafe)
                result += c;
            else
            {
                result += '%';
                result += hex[c >> 4];
                result += hex[c & 0xf];
            }
        }
        return result;
    }
}
// Starts the generated Perl script: loads Archive::Zip and creates the
// archive object `$zip` that the add*/dontCompress/commit statements, which
// are appended to `perlline` later, operate on.
JarOutputStream::JarOutputStream()
{
perlline << "use Archive::Zip qw(:ERROR_CODES); ";
perlline << "my $zip = Archive::Zip->new(); ";
}
/**
 * Returns a copy of `result` in which every occurrence of `search` is
 * replaced by `replace`.
 *
 * Occurrences are found left to right; text inserted by a replacement is not
 * searched again. An empty `search` string matches nothing, so the input is
 * returned unchanged (without this guard the search would never advance and
 * the loop would not terminate).
 *
 * @param result  string to work on (taken by value and returned)
 * @param search  text to look for
 * @param replace text to insert instead
 * @return the string with all replacements applied
 */
std::string replaceAll(std::string result,
    const std::string &search, const std::string &replace)
{
    if (search.empty())
        return result;

    for (std::string::size_type pos = result.find(search);
         pos != std::string::npos;
         pos = result.find(search, pos + replace.size()))
    {
        result.replace(pos, search.size(), replace);
    }
    return result;
}
// Appends a `$zip->addFile("<fileName>", "<name>")` statement to the Perl
// script. Backslashes in fileName are turned into forward slashes first.
// NOTE(review): both strings are placed unescaped inside double-quoted Perl
// string literals, so characters such as '"', '$' or '@' in them would
// change the generated script.
void JarOutputStream::addFile(const std::string &fileName, const std::string &name)
{
perlline << "$zip->addFile(\"" << replaceAll(fileName, "\\", "/") << "\", \"" << name << "\"); ";
}
// Appends a `$zip->addTree("<tree>", "<name>")` statement to the Perl
// script. Backslashes in tree are turned into forward slashes first.
// NOTE(review): both strings are placed unescaped inside double-quoted Perl
// string literals, so characters such as '"', '$' or '@' in them would
// change the generated script.
void JarOutputStream::addTree(const std::string &tree, const std::string &name)
{
perlline << "$zip->addTree(\"" << replaceAll(tree, "\\", "/") << "\", \"" << name << "\"); ";
}
// Appends Perl statements that look up the archive member named `key` and,
// if it exists, set its desired compression method to COMPRESSION_STORED.
// NOTE(review): every call emits `my $member`, so calling this more than once
// redeclares the variable in the generated script (Perl only warns about
// this when warnings are enabled).
void JarOutputStream::dontCompress(const std::string &key)
{
perlline << "my $member = $zip->memberNamed(\"" << key << "\"); ";
perlline << "if ($member) { $member->desiredCompressionMethod( COMPRESSION_STORED ); } ";
}
/**
 * Finishes the generated Perl script, writes it to "<name>.perl", runs it
 * with perl to create the archive and removes the script file again.
 *
 * If the environment variable USE_SHELL is "4nt" the interpreter is taken
 * from the environment variable PERL; when PERL is unset or empty the plain
 * "perl" command is used instead. A failing script run is reported on stderr.
 */
void JarOutputStream::commit()
{
    perlline << "print $zip->writeToFileNamed(\"" << replaceAll(getname().native_file_string(), "\\", "/") << "\").\"\\n\"; ";

    fs::path tmp = getname();
    tmp.append(".perl");
    std::string perlfile = replaceAll( tmp.native_file_string(), "\\", "/");
    std::ofstream fos(perlfile.c_str());
    fos << perlline.str();
    fos.close();

    std::string myperl("perl");
    const char* use_shell = getenv( "USE_SHELL" );
    if ( use_shell && std::string(use_shell) == "4nt" )
    {
        // in SO windows environment perl isn't in the path and
        // needs to be fetched from the environment. this doesn't
        // work in a cygwin shell as "/usr/bin/perl" will fail in a
        // native shell (see system call).
        // getenv() returns NULL for an unset variable; constructing a
        // std::string from NULL is undefined, so only take a real value.
        const char* perlFromEnv = getenv( "PERL" );
        if ( perlFromEnv && *perlFromEnv )
            myperl = perlFromEnv;
    }

    std::string commandline;
    commandline = myperl + " " + perlfile;
    HCDBG(std::cerr << "command line 3 is" << commandline << std::endl);
    // on windows, calling perl (either cygwin or native) from a native
    // shell the only chance to survive is using "c:/foo" notation
    if ( system(commandline.c_str()) )
        fprintf (stderr, "ERROR: calling generated perl script failed!\n");
    fs::remove(tmp);
}
/**
 * Stores one bookmark record in `dbBase`.
 *
 * The key is the help id: upper-cased with ':' replaced by '_' for the lookup
 * in the hid translation table, replaced by the translation if one exists,
 * and finally URL-encoded. The value consists of three length-prefixed
 * fields (one length byte each): "file[#anchor]", jar file and title.
 *
 * @param dbBase   database the record is put into
 * @param thishid  help id of the bookmark
 * @param fileB    file part of the target
 * @param anchorB  anchor inside the file; may be empty
 * @param jarfileB name of the jar file
 * @param titleB   title of the document
 */
void HelpLinker::addBookmark( DB* dbBase, std::string thishid,
    const std::string& fileB, const std::string& anchorB,
    const std::string& jarfileB, const std::string& titleB)
{
    HCDBG(std::cerr << "HelpLinker::addBookmark " << thishid << " " <<
        fileB << " " << anchorB << " " << jarfileB << " " << titleB << std::endl);

    // Normalize the id for the translation lookup.
    std::string lookupKey = thishid;
    std::transform (lookupKey.begin(), lookupKey.end(), lookupKey.begin(), toupper);
    std::replace(lookupKey.begin(), lookupKey.end(), ':', '_');
    const std::string& translatedHid = hidlistTranslation[lookupKey];
    if (!translatedHid.empty())
        thishid = translatedHid;
    thishid = URLEncoder::encode(thishid);

    DBT key;
    memset(&key, 0, sizeof(key));
    key.data = const_cast<char*>(thishid.c_str());
    key.size = thishid.length();

    // Length of the first field: file name plus optional "#anchor".
    const bool hasAnchor = !anchorB.empty();
    int fileLen = fileB.length();
    if (hasAnchor)
        fileLen += (1 + anchorB.length());
    int dataLen = 1 + fileLen + 1 + jarfileB.length() + 1 + titleB.length();

    // Assemble the record field by field.
    std::vector<unsigned char> record;
    record.reserve(dataLen);
    record.push_back(static_cast<unsigned char>(fileLen));
    record.insert(record.end(), fileB.begin(), fileB.end());
    if (hasAnchor)
    {
        record.push_back('#');
        record.insert(record.end(), anchorB.begin(), anchorB.end());
    }
    record.push_back(static_cast<unsigned char>(jarfileB.length()));
    record.insert(record.end(), jarfileB.begin(), jarfileB.end());
    record.push_back(static_cast<unsigned char>(titleB.length()));
    record.insert(record.end(), titleB.begin(), titleB.end());

    DBT data;
    memset(&data, 0, sizeof(data));
    data.data = &record[0];
    data.size = record.size();
    dbBase->put(dbBase, NULL, &key, &data, 0);
}
// Writes the text `defaultXSL` to the file `fileName` inside
// indexDirParentName. The stream is not checked for errors and is closed
// when `fos` goes out of scope.
void HelpLinker::createFileFromBytes(const std::string &fileName,
const std::string &defaultXSL)
{
std::ofstream fos((indexDirParentName / fileName).native_file_string().c_str());
fos << defaultXSL;
}
/**
 * Creates the index directory "<module in lower case>.idx" below
 * indexDirParentName and a fresh XmlIndexBuilder for it, writes a
 * "default.xsl" that matches everything and outputs nothing, clears the
 * index and points the builder at indexDirParentName for its stylesheets.
 */
void HelpLinker::initXMLIndexBuilder()
{
    std::string lowerModule(module);
    std::transform (lowerModule.begin(), lowerModule.end(), lowerModule.begin(), tolower);

    indexDirName = indexDirParentName / (lowerModule + ".idx");
    fs::create_directory(indexDirName);

    // Replace any previous builder; deleting a null pointer is a no-op.
    delete xmlIndexBuilder;
    xmlIndexBuilder = new XmlIndexBuilder(indexDirName);

    const std::string defaultXSL(
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
        "<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n"
        "\t<xsl:template match=\"*|/\"/>\n"
        "</xsl:stylesheet>");
    createFileFromBytes("default.xsl", defaultXSL);

    xmlIndexBuilder->clearIndex(); // Build index from scratch
    xmlIndexBuilder->setTransformLocation(indexDirParentName);
}
namespace
{
// Returns a path for temporary use: a temporary file is created through
// osl::File::createTempFile(), which stores its location in ret.data, and is
// removed again right away so that only the name is kept.
// The return value of createTempFile() is not checked.
fs::path gettmppath()
{
fs::path ret;
osl::File::createTempFile(0, 0, &ret.data);
fs::remove(ret);
return ret;
}
}
/**
 * XPath extension function "orig-pointer".
 *
 * Pushes the address of a node of the original document, formatted as a
 * decimal number, onto the XPath value stack. Without an argument the context
 * node is used; with one node-set argument the node is selected from that
 * set by comparing its members with xmlXPathCmpNodes().
 *
 * Any misuse (more than one argument, missing or empty node set, no node)
 * prints a message to stderr and terminates the process, because no
 * exception can be thrown through the C callback.
 *
 * @param ctxt  XPath parser context supplying arguments and receiving the result
 * @param nargs number of arguments passed to the function
 */
extern "C" void function_orig_pointer(xmlXPathParserContextPtr ctxt, int nargs)
{
    if (nargs > 1)
    {
        // TODO: Change when used for extensions, no exception possible here
        std::cerr << "function_orig_pointer, too many args" << std::endl;
        exit(-1);
    }

    xmlNodePtr cur = NULL;
    if (nargs == 0)
        cur = ctxt->context->node;
    else if (nargs == 1)
    {
        xmlXPathObjectPtr obj = valuePop(ctxt);
        // valuePop() yields NULL on an empty stack; treat that like a
        // missing node set instead of dereferencing it.
        xmlNodeSetPtr nodelist = obj ? obj->nodesetval : NULL;
        if ((nodelist == NULL) || (nodelist->nodeNr <= 0))
        {
            // TODO: Change when used for extensions, no exception possible here
            std::cerr << "function_orig_pointer, bad nodeset" << std::endl;
            exit(-1);
        }
        cur = nodelist->nodeTab[0];
        for (int i = 1; i < nodelist->nodeNr; ++i)
        {
            int ret = xmlXPathCmpNodes(cur, nodelist->nodeTab[i]);
            if (ret == -1)
                cur = nodelist->nodeTab[i];
        }
        xmlXPathFreeObject(obj);
    }

    if (cur == NULL)
    {
        // TODO: Change when used for extensions, no exception possible here
        std::cerr << "function_orig_pointer, bad node" << std::endl;
        exit(-1);
    }

    // A 64-bit value printed with "%ld" needs up to 20 characters plus the
    // terminating NUL, so 20 bytes are not enough in the worst case.
    // xmlXPathNewString() copies the text, hence a local buffer suffices.
    xmlChar str[32];
    sprintf((char *)str, "%ld", (sal_uIntPtr)(cur));
    valuePush(ctxt, xmlXPathNewString(str));
}
// Initialization callback of the XSLT extension module: registers
// function_orig_pointer under the name "orig-pointer" in namespace `uri` for
// the given transformation context. A registration failure is reported on
// stderr and terminates the process. Always returns NULL, i.e. the module
// keeps no per-transformation data.
extern "C" void* cmc_module_init(xsltTransformContextPtr ctxt, const xmlChar* uri)
{
if (xsltRegisterExtFunction(ctxt, (const xmlChar*)"orig-pointer", uri, function_orig_pointer))
{
// TODO: Change when used for extensions, no exception possible here
std::cerr << "failure to register function_orig_pointer" << std::endl;
exit(-1);
}
return NULL;
}
// Shutdown callback of the XSLT extension module. Nothing to release:
// cmc_module_init() returns NULL and allocates no module data.
extern "C" void cmc_module_term(xsltTransformContextPtr, const xmlChar*, void*)
{
}
/**
*
*/
void HelpLinker::link() throw( HelpProcessingException )
{
bool bIndexForExtension = false; // TODO
if( bExtensionMode )
{
indexDirParentName = sourceRoot;
}
else
{
indexDirParentName = gettmppath();
fs::create_directory(indexDirParentName);
}
#ifdef CMC_DEBUG
std::cerr << "will not delete tmpdir of " << indexDirParentName.native_file_string().c_str() << std::endl;
#endif
std::string mod = module;
std::transform (mod.begin(), mod.end(), mod.begin(), tolower);
// Determine the outputstream
fs::path outputTmpFile;
if( !bExtensionMode )
{
outputTmpFile = outputFile;
outputTmpFile.append(".tmp");
jarOutputStream.setname(outputTmpFile);
}
// do the work here
// continue with introduction of the overall process thing into the
// here all hzip files will be worked on
std::string appl = mod;
if (appl[0] == 's')
appl = appl.substr(1);
fs::path helpTextFileName(indexDirParentName / (mod + ".ht"));
DB* helpText(0);
db_create(&helpText,0,0);
helpText->open(helpText, NULL, helpTextFileName.native_file_string().c_str(), NULL, DB_BTREE,
DB_CREATE | DB_TRUNCATE, 0644);
fs::path dbBaseFileName(indexDirParentName / (mod + ".db"));
DB* dbBase(0);
db_create(&dbBase,0,0);
dbBase->open(dbBase, NULL, dbBaseFileName.native_file_string().c_str(), NULL, DB_BTREE,
DB_CREATE | DB_TRUNCATE, 0644);
fs::path keyWordFileName(indexDirParentName / (mod + ".key"));
DB* keyWord(0);
db_create(&keyWord,0,0);
keyWord->open(keyWord, NULL, keyWordFileName.native_file_string().c_str(), NULL, DB_BTREE,
DB_CREATE | DB_TRUNCATE, 0644);
HelpKeyword helpKeyword;
// catch HelpProcessingException to avoid locking data bases
try
{
std::ifstream fileReader(hid.c_str());
while (fileReader)
{
std::string key;
fileReader >> key;
std::transform (key.begin(), key.end(), key.begin(), toupper);
std::replace(key.begin(), key.end(), ':', '_');
std::string data;
fileReader >> data;
if (!key.empty() && !data.empty())
hidlistTranslation[key] = data;
}
// lastly, initialize the indexBuilder
if ( (!bExtensionMode || bIndexForExtension) && !helpFiles.empty())
initXMLIndexBuilder();
if( !bExtensionMode )
{
std::cout << "Making " << outputFile.native_file_string() <<
" from " << helpFiles.size() << " input files" << std::endl;
}
// here we start our loop over the hzip files.
HashSet::iterator end = helpFiles.end();
for (HashSet::iterator iter = helpFiles.begin(); iter != end; ++iter)
{
std::cout << ".";
std::cout.flush();
// process one file
// streamTable contains the streams in the hzip file
StreamTable streamTable;
const std::string &xhpFileName = *iter;
if (!bExtensionMode && xhpFileName.rfind(".xhp") != xhpFileName.length()-4)
{
// only work on .xhp - files
std::cerr <<
"ERROR: input list entry '"
<< xhpFileName
<< "' has the wrong extension (only files with extension .xhp "
<< "are accepted)";
continue;
}
fs::path langsourceRoot(sourceRoot);
fs::path xhpFile;
if( bExtensionMode )
{
// langsourceRoot == sourceRoot for extensions
std::string xhpFileNameComplete( extensionPath );
xhpFileNameComplete.append( '/' + xhpFileName );
xhpFile = fs::path( xhpFileNameComplete );
}
else
{
langsourceRoot.append('/' + lang + '/');
xhpFile = fs::path(xhpFileName, fs::native);
}
HelpCompiler hc( streamTable, xhpFile, langsourceRoot,
embeddStylesheet, module, lang, bExtensionMode );
HCDBG(std::cerr << "before compile of " << xhpFileName << std::endl);
bool success = hc.compile();
HCDBG(std::cerr << "after compile of " << xhpFileName << std::endl);
if (!success && !bExtensionMode)
{
std::stringstream aStrStream;
aStrStream <<
"\nERROR: compiling help particle '"
<< xhpFileName
<< "' for language '"
<< lang
<< "' failed!";
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
const std::string documentBaseId = streamTable.document_id;
std::string documentPath = streamTable.document_path;
if (documentPath.find("/") == 0)
documentPath = documentPath.substr(1);
std::string documentJarfile = streamTable.document_module + ".jar";
std::string documentTitle = streamTable.document_title;
if (documentTitle.empty())
documentTitle = "<notitle>";
#if 0
std::cout << "for " << xhpFileName << " documentBaseId is " << documentBaseId << "\n";
std::cout << "for " << xhpFileName << " documentPath is " << documentPath << "\n";
std::cout << "for " << xhpFileName << " documentJarfile is " << documentJarfile << "\n";
std::cout << "for " << xhpFileName << " documentPath is " << documentTitle << "\n";
#endif
const std::string& fileB = documentPath;
const std::string& jarfileB = documentJarfile;
std::string& titleB = documentTitle;
// add once this as its own id.
addBookmark(dbBase, documentPath, fileB, std::string(), jarfileB, titleB);
if ( (!bExtensionMode || bIndexForExtension) && init)
{
std::ifstream indexXSLFile(indexStylesheet.native_file_string().c_str());
std::ostringstream baos;
baos << indexXSLFile.rdbuf();
std::string xsl = baos.str();
//I see that we later generate a map of generateids to nodes which we will use
//to link the results of generate-id in the transformed document back to the nodes
//in the original document, so let's cut out the middle-men and make an extension
//which does exactly what we want, and give us a pointer to the original node
xsl.replace(xsl.find("<xsl:stylesheet"), strlen("<xsl:stylesheet"),
"<xsl:stylesheet extension-element-prefixes=\"CMC\" xmlns:CMC=\"http://www.cunninghack.org\"");
xsl.replace(xsl.find("generate-id"), strlen("generate-id"), "CMC:orig-pointer");
if (xsltRegisterExtModule((const xmlChar*)"http://www.cunninghack.org", cmc_module_init, cmc_module_term))
{
std::stringstream aStrStream;
aStrStream << "fatal error on registering xslt module" << std::endl;
throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
}
createFileFromBytes("index.xsl", xsl);
xmlIndexBuilder->init("index");
init = false;
}
// first the database *.db
// ByteArrayInputStream bais = null;
// ObjectInputStream ois = null;
const HashSet *hidlist = streamTable.appl_hidlist;
if (!hidlist)
hidlist = streamTable.default_hidlist;
if (hidlist && !hidlist->empty())
{
// now iterate over all elements of the hidlist
HashSet::const_iterator aEnd = hidlist->end();
for (HashSet::const_iterator hidListIter = hidlist->begin();
hidListIter != aEnd; ++hidListIter)
{
std::string thishid = *hidListIter;
std::string anchorB;
size_t index = thishid.rfind('#');
if (index != std::string::npos)
{
anchorB = thishid.substr(1 + index);
thishid = thishid.substr(0, index);
}
addBookmark(dbBase, thishid, fileB, anchorB, jarfileB, titleB);
}
}
// now the keywords
const Hashtable *anchorToLL = streamTable.appl_keywords;
if (!anchorToLL)
anchorToLL = streamTable.default_keywords;
if (anchorToLL && !anchorToLL->empty())
{
std::string fakedHid = URLEncoder::encode(documentPath);
Hashtable::const_iterator aEnd = anchorToLL->end();
for (Hashtable::const_iterator enumer = anchorToLL->begin();
enumer != aEnd; ++enumer)
{
const std::string &anchor = enumer->first;
addBookmark(dbBase, documentPath, fileB,
anchor, jarfileB, titleB);
std::string totalId = fakedHid + "#" + anchor;
// std::cerr << hzipFileName << std::endl;
const LinkedList& ll = enumer->second;
LinkedList::const_iterator aOtherEnd = ll.end();
for (LinkedList::const_iterator llIter = ll.begin();
llIter != aOtherEnd; ++llIter)
{
helpKeyword.insert(*llIter, totalId);
}
}
}
// and last the helptexts
const Stringtable *helpTextHash = streamTable.appl_helptexts;
if (!helpTextHash)
helpTextHash = streamTable.default_helptexts;
if (helpTextHash && !helpTextHash->empty())
{
Stringtable::const_iterator aEnd = helpTextHash->end();
for (Stringtable::const_iterator helpTextIter = helpTextHash->begin();
helpTextIter != aEnd; ++helpTextIter)
{
std::string helpTextId = helpTextIter->first;
const std::string& helpTextText = helpTextIter->second;
std::string temp = helpTextId;
std::transform (temp.begin(), temp.end(), temp.begin(), toupper);
std::replace(temp.begin(), temp.end(), ':', '_');
const std::string& tHid = hidlistTranslation[temp];
if (!tHid.empty())
helpTextId = tHid;
helpTextId = URLEncoder::encode(helpTextId);
DBT keyDbt;
memset(&keyDbt, 0, sizeof(keyDbt));
keyDbt.data = const_cast<char*>(helpTextId.c_str());
keyDbt.size = helpTextId.length();
DBT textDbt;
memset(&textDbt, 0, sizeof(textDbt));
textDbt.data = const_cast<char*>(helpTextText.c_str());
textDbt.size = helpTextText.length();
helpText->put(helpText, NULL, &keyDbt, &textDbt, 0);
}
}
if( !bExtensionMode || bIndexForExtension )
{
// now the indexing
xmlDocPtr document = streamTable.appl_doc;
if (!document)
document = streamTable.default_doc;
if (document)
{
std::string temp = module;
std::transform (temp.begin(), temp.end(), temp.begin(), tolower);
xmlIndexBuilder->indexDocument(document,
std::string("vnd.sun.star.help://")
+ temp
+ "/"
+ URLEncoder::encode(documentPath),
"");
}
}
} // while loop over hzip files ending
if( !bExtensionMode )
std::cout << std::endl;
} // try
catch( HelpProcessingException& )
{
// catch HelpProcessingException to avoid locking data bases
helpText->close(helpText, 0);
dbBase->close(dbBase, 0);
keyWord->close(keyWord, 0);
throw;
}
helpText->close(helpText, 0);
dbBase->close(dbBase, 0);
helpKeyword.dump(keyWord);
keyWord->close(keyWord, 0);
if (!bExtensionMode && !helpFiles.empty())
{
closeXMLIndexBuilder();
HCDBG(std::cerr << "dir is " << indexDirName.native_directory_string() << std::endl);
jarOutputStream.addTree(indexDirName.native_file_string(), mod + ".idx");
}
if( !bExtensionMode )
{
jarOutputStream.addFile(helpTextFileName.native_file_string(), mod + ".ht");
jarOutputStream.addFile(dbBaseFileName.native_file_string(), mod + ".db");
jarOutputStream.addFile(keyWordFileName.native_file_string(), mod + ".key");
/////////////////////////////////////////////////////////////////////////
// last, all files which should be copied into the jar file
/////////////////////////////////////////////////////////////////////////
Stringtable::iterator aEnd = additionalFiles.end();
for (Stringtable::iterator enumer = additionalFiles.begin(); enumer != aEnd;
++enumer)
{
const std::string &additionalFileKey = enumer->first;
const std::string &additionalFileName = enumer->second;
jarOutputStream.addFile(additionalFileName, additionalFileKey);
}
jarOutputStream.dontCompress(mod + ".jar");
jarOutputStream.commit();
HCDBG(std::cerr << "like to rename " << outputTmpFile.native_file_string() << " as " <<
outputFile.native_file_string() << std::endl);
fs::rename(outputTmpFile, outputFile);
if (!fs::exists(outputFile))
{
std::stringstream aStrStream;
aStrStream << "can't rename file '" << outputTmpFile.native_file_string() << "'" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
}
/////////////////////////////////////////////////////////////////////////
/// remove temporary directory for index creation
/////////////////////////////////////////////////////////////////////////
#ifndef CMC_DEBUG
if( !bExtensionMode )
fs::remove_all( indexDirParentName );
#endif
}
// Definitions of HelpLinker's static data members. Most of them hold the
// command line settings that HelpLinker::main() parses.
// NOTE(review): the use of locCount / totCount is not visible in this part of the file.
int HelpLinker::locCount;
int HelpLinker::totCount;
// Files given with "-add <key> <file>": maps the first operand to the second
Stringtable HelpLinker::additionalFiles;
// Every argument that is not a recognised option (the help files to process)
HashSet HelpLinker::helpFiles;
// Value of "-src", or the extension path when running in extension mode
fs::path HelpLinker::sourceRoot;
// Values of "-sty" and "-idx"
fs::path HelpLinker::embeddStylesheet, HelpLinker::indexStylesheet;
// Value of "-o"
fs::path HelpLinker::outputFile;
// Value of "-mod"
std::string HelpLinker::module;
// Value of "-lang"
std::string HelpLinker::lang;
// Value of "-hid"
std::string HelpLinker::hid;
// Copy of the extension path handed to HelpLinker::main()
std::string HelpLinker::extensionPath;
// true when HelpLinker::main() was called with a non-empty extension path
bool HelpLinker::bExtensionMode;
// NOTE(review): presumably a counter for temporary file names; its users are
// not visible in this part of the file.
int GnTmpFileCounter = 0;
void HelpLinker::main(std::vector<std::string> &args, std::string* pExtensionPath)
throw( HelpProcessingException )
{
bExtensionMode = false;
if( pExtensionPath && pExtensionPath->length() > 0 )
{
helpFiles.clear();
bExtensionMode = true;
extensionPath = *pExtensionPath;
sourceRoot = fs::path(extensionPath);
}
if (args.size() > 0 && args[0][0] == '@')
{
std::vector<std::string> stringList;
std::string strBuf;
std::ifstream fileReader(args[0].substr(1).c_str());
while (fileReader)
{
std::string token;
fileReader >> token;
if (!token.empty())
stringList.push_back(token);
}
args = stringList;
}
size_t i = 0;
while (i < args.size())
{
if (args[i].compare("-src") == 0)
{
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
aStrStream << "sourceroot missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
if( !bExtensionMode )
sourceRoot = fs::path(args[i], fs::native);
}
else if (args[i].compare("-sty") == 0)
{
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
aStrStream << "embeddingStylesheet missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
embeddStylesheet = fs::path(args[i], fs::native);
}
else if (args[i].compare("-idx") == 0)
{
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
aStrStream << "indexstylesheet missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
indexStylesheet = fs::path(args[i], fs::native);
}
else if (args[i].compare("-o") == 0)
{
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
aStrStream << "outputfilename missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
outputFile = fs::path(args[i], fs::native);
}
else if (args[i].compare("-mod") == 0)
{
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
aStrStream << "module name missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
module = args[i];
}
else if (args[i].compare("-lang") == 0)
{
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
aStrStream << "language name missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
lang = args[i];
}
else if (args[i].compare("-hid") == 0)
{
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
aStrStream << "hid list missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
hid = args[i];
}
else if (args[i].compare("-add") == 0)
{
std::string addFile, addFileUnderPath;
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
aStrStream << "pathname missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
addFileUnderPath = args[i];
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
aStrStream << "pathname missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
addFile = args[i];
if (!addFileUnderPath.empty() && !addFile.empty())
additionalFiles[addFileUnderPath] = addFile;
}
else
helpFiles.push_back(args[i]);
++i;
}
if (!bExtensionMode && indexStylesheet.empty())
{
std::stringstream aStrStream;
aStrStream << "no index file given" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
if (!bExtensionMode && embeddStylesheet.empty())
{
std::stringstream aStrStream;
aStrStream << "no embedding resolving file given" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
if (sourceRoot.empty())
{
std::stringstream aStrStream;
aStrStream << "no sourceroot given" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
if (!bExtensionMode && outputFile.empty())
{
std::stringstream aStrStream;
aStrStream << "no output file given" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
if (module.empty())
{
std::stringstream aStrStream;
aStrStream << "module missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
if (!bExtensionMode && lang.empty())
{
std::stringstream aStrStream;
aStrStream << "language missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
if (!bExtensionMode && hid.empty())
{
std::stringstream aStrStream;
aStrStream << "hid list missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
HelpLinker().link();
}
int main(int argc, char**argv)
{
sal_uInt32 starttime = osl_getGlobalTimer();
std::vector<std::string> args;
for (int i = 1; i < argc; ++i)
args.push_back(std::string(argv[i]));
try
{
HelpLinker::main(args);
}
catch( const HelpProcessingException& e )
{
std::cerr << e.m_aErrorMsg;
exit(1);
}
sal_uInt32 endtime = osl_getGlobalTimer();
std::cout << "time taken was " << (endtime-starttime)/1000.0 << " seconds" << std::endl;
return 0;
}
// Variable to set an exception in "C" StructuredXMLErrorFunction
static const HelpProcessingException* GpXMLParsingException = NULL;
extern "C" void StructuredXMLErrorFunction(void *userData, xmlErrorPtr error)
{
(void)userData;
(void)error;
std::string aErrorMsg = error->message;
std::string aXMLParsingFile;
if( error->file != NULL )
aXMLParsingFile = error->file;
int nXMLParsingLine = error->line;
HelpProcessingException* pException = new HelpProcessingException( aErrorMsg, aXMLParsingFile, nXMLParsingLine );
GpXMLParsingException = pException;
// Reset error handler
xmlSetStructuredErrorFunc( NULL, NULL );
}
// Takes over error class, message, XML file name and line number from a
// HelpProcessingException. The two byte strings are converted to OUString
// using the thread's text encoding.
HelpProcessingErrorInfo& HelpProcessingErrorInfo::operator=( const struct HelpProcessingException& e )
{
    // plain values
    m_eErrorClass = e.m_eErrorClass;
    m_nXMLParsingLine = e.m_nXMLParsingLine;

    // strings: std::string -> OString -> OUString
    m_aErrorMsg = rtl::OStringToOUString(
        rtl::OString( e.m_aErrorMsg.c_str() ), osl_getThreadTextEncoding() );
    m_aXMLParsingFile = rtl::OStringToOUString(
        rtl::OString( e.m_aXMLParsingFile.c_str() ), osl_getThreadTextEncoding() );

    return *this;
}
// Compiles the help files of an extension by running HelpLinker::main() in
// extension mode and afterwards checks that "<language root>/help.tree", if
// it exists, is well-formed XML.
//
// aExtensionName             passed to the linker as the "-mod" value
// aExtensionLanguageRoot     passed to the linker as extension path; also the
//                            location of help.tree
// nXhpFileCount, pXhpFiles   the xhp files to compile
// o_rHelpProcessingErrorInfo filled with the error details on failure
//
// Returns true in case of success, false in case of error
HELPLINKER_DLLPUBLIC bool compileExtensionHelp
(
    const rtl::OUString& aExtensionName,
    const rtl::OUString& aExtensionLanguageRoot,
    sal_Int32 nXhpFileCount, const rtl::OUString* pXhpFiles,
    HelpProcessingErrorInfo& o_rHelpProcessingErrorInfo
)
{
    bool bSuccess = true;

    // Argument list for the linker: "-mod <name>" followed by the xhp files.
    // The strings are copied straight into the vector; no manually managed
    // argv array is needed.
    std::vector<std::string> args;
    rtl::OString aOExtensionName = rtl::OUStringToOString( aExtensionName, osl_getThreadTextEncoding() );
    args.push_back( std::string( "-mod" ) );
    args.push_back( std::string( aOExtensionName.getStr() ) );
    for( sal_Int32 iXhp = 0 ; iXhp < nXhpFileCount ; ++iXhp )
    {
        rtl::OString aOXhpFile = rtl::OUStringToOString( pXhpFiles[iXhp], osl_getThreadTextEncoding() );
        args.push_back( std::string( aOXhpFile.getStr() ) );
    }

    rtl::OString aOExtensionLanguageRoot = rtl::OUStringToOString( aExtensionLanguageRoot, osl_getThreadTextEncoding() );
    std::string aStdStrExtensionPath = aOExtensionLanguageRoot.getStr();

    // Set error handler
    xmlSetStructuredErrorFunc( NULL, (xmlStructuredErrorFunc)StructuredXMLErrorFunction );
    try
    {
        HelpLinker::main(args,&aStdStrExtensionPath);
    }
    catch( const HelpProcessingException& e )
    {
        // Prefer the details recorded by the XML error handler, if any
        if( GpXMLParsingException != NULL )
        {
            o_rHelpProcessingErrorInfo = *GpXMLParsingException;
            delete GpXMLParsingException;
            GpXMLParsingException = NULL;
        }
        else
        {
            o_rHelpProcessingErrorInfo = e;
        }
        bSuccess = false;
    }
    // Reset error handler
    xmlSetStructuredErrorFunc( NULL, NULL );

    // An XML error may have been recorded without an exception being thrown.
    // Drop it so that it is neither leaked nor reported by a later call.
    if( GpXMLParsingException != NULL )
    {
        delete GpXMLParsingException;
        GpXMLParsingException = NULL;
    }

    // i83624: Tree files
    ::rtl::OUString aTreeFileURL = aExtensionLanguageRoot;
    aTreeFileURL += rtl::OUString::createFromAscii( "/help.tree" );
    osl::DirectoryItem aTreeFileItem;
    osl::FileBase::RC rcGet = osl::DirectoryItem::get( aTreeFileURL, aTreeFileItem );
    osl::FileStatus aFileStatus( FileStatusMask_FileSize );
    if( rcGet == osl::FileBase::E_None &&
        aTreeFileItem.getFileStatus( aFileStatus ) == osl::FileBase::E_None &&
        aFileStatus.isValid( FileStatusMask_FileSize ) )
    {
        const sal_uInt64 len = aFileStatus.getFileSize();
        sal_uInt64 nBytesRead = 0;
        // One spare byte so that &aBuffer[0] is valid for an empty file too
        std::vector<char> aBuffer( static_cast<size_t>(len) + 1 );

        osl::File aFile( aTreeFileURL );
        if( aFile.open( OpenFlag_Read ) == osl::FileBase::E_None )
        {
            if( aFile.read( &aBuffer[0], len, nBytesRead ) != osl::FileBase::E_None )
                nBytesRead = 0;
            aFile.close();
        }

        // Parse only what was actually read. If the file could not be read
        // this is an empty document, which is reported as a parse error below.
        XML_Parser parser = XML_ParserCreate( 0 );
        int parsed = XML_Parse( parser, &aBuffer[0], int( nBytesRead ), true );

        if( parsed == 0 )
        {
            XML_Error nError = XML_GetErrorCode( parser );
            o_rHelpProcessingErrorInfo.m_eErrorClass = HELPPROCESSING_XMLPARSING_ERROR;
            o_rHelpProcessingErrorInfo.m_aErrorMsg = rtl::OUString::createFromAscii( XML_ErrorString( nError ) );
            o_rHelpProcessingErrorInfo.m_aXMLParsingFile = aTreeFileURL;
            // CRASHES!!! o_rHelpProcessingErrorInfo.m_nXMLParsingLine = XML_GetCurrentLineNumber( parser );
            bSuccess = false;
        }

        XML_ParserFree( parser );
    }

    return bSuccess;
}
// vnd.sun.star.help://swriter/52821?Language=en-US&System=UNIX
/* vi:set tabstop=4 shiftwidth=4 expandtab: */