/************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * Copyright 2008 by Sun Microsystems, Inc. * * OpenOffice.org - a multi-platform office productivity suite * * $RCSfile: HelpLinker.cxx,v $ * $Revision: 1.13 $ * * This file is part of OpenOffice.org. * * OpenOffice.org is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License version 3 * only, as published by the Free Software Foundation. * * OpenOffice.org is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License version 3 for more details * (a copy is included in the LICENSE file that accompanied this code). * * You should have received a copy of the GNU Lesser General Public License * version 3 along with OpenOffice.org. If not, see * * for a copy of the LGPLv3 License. 
* ************************************************************************/ #include "HelpCompiler.hxx" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSTEM_EXPAT #include #else #include #endif class JarOutputStream { private: fs::path filename; std::ostringstream perlline; public: JarOutputStream(); void setname(const fs::path &name) { filename = name; } const fs::path& getname() const { return filename; } void addFile(const std::string &name, const std::string &key); void addTree(const std::string &dir, const std::string &key); void dontCompress(const std::string &key); void commit(); }; struct Data { std::vector _idList; typedef std::vector::const_iterator cIter; void append(const std::string &id) { _idList.push_back(id); } std::string getString() const { std::string ret; cIter aEnd = _idList.end(); for (cIter aIter = _idList.begin(); aIter != aEnd; ++aIter) ret += *aIter + ";"; return ret; } }; class HelpKeyword { private: typedef std::hash_map DataHashtable; DataHashtable _hash; public: void insert(const std::string &key, const std::string &id) { Data &data = _hash[key]; data.append(id); } void dump(DB* table) { DataHashtable::const_iterator aEnd = _hash.end(); for (DataHashtable::const_iterator aIter = _hash.begin(); aIter != aEnd; ++aIter) { const std::string &keystr = aIter->first; DBT key; memset(&key, 0, sizeof(key)); key.data = const_cast(keystr.c_str()); key.size = keystr.length(); const Data &data = aIter->second; std::string str = data.getString(); DBT value; memset(&value, 0, sizeof(value)); value.data = const_cast(str.c_str()); value.size = str.length(); table->put(table, NULL, &key, &value, 0); } } }; namespace PrefixTranslator { std::string translatePrefix(const std::string &input) { if (input.find("vnd.sun.star.help://") == 0) return std::string("#HLP#") + input.substr(strlen("vnd.sun.star.help://")); else return input; } } class IndexAccessor { 
fs::path _dirName; public: IndexAccessor(const fs::path &dirName) : _dirName(dirName) {} IndexAccessor(const IndexAccessor &another) { _dirName = another._dirName; } fs::path indexFile(const std::string &name) const { return _dirName / name; } std::ifstream* getLineInput(const std::string &name); std::fstream* getOutputStream(const std::string &name); std::vector readByteArray(const std::string &fileName); void clear(); std::fstream *getRAF(const std::string &name, bool update) throw( HelpProcessingException ); void createIfNeeded() {} }; std::ifstream* IndexAccessor::getLineInput(const std::string &name) { return new std::ifstream(indexFile(name).native_file_string().c_str()); } std::fstream* IndexAccessor::getOutputStream(const std::string &name) { return new std::fstream(indexFile(name).native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary); } std::vector IndexAccessor::readByteArray(const std::string &fileName) { std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary); std::vector ret(1024*16); int i=0; while (in.good()) { int len = in.readsome((char *)&ret[i], 1024*16); if (!len) break; i += len; ret.resize(i+1024*16); } ret.resize(i); return ret; } std::fstream* IndexAccessor::getRAF(const std::string &name, bool update) throw( HelpProcessingException ) { std::fstream *_file = new std::fstream; fs::path fullname = indexFile(name); if (!update) { _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::binary); } else { _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary); if (!_file->is_open()) { HCDBG(std::cerr << "didn't exist" << std::endl); _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary); } if (!_file->is_open()) { std::stringstream aStrStream; aStrStream << "Cannot open " << name << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, 
aStrStream.str() ); } } return _file; } void IndexAccessor::clear() { #if 0 File thisDir = indexFile("."); File[] components = thisDir.listFiles(); if (components != null) for (int i = 0; i < components.length; i++) components[i].delete(); #endif } typedef std::vector< std::string > VectorLines; class Schema : public IndexAccessor { private: static std::string PartName; bool _update; VectorLines _lines; public: Schema(const IndexAccessor &index, bool update); std::ifstream* getSchemaLineInput() { return getLineInput(PartName); } void read(); Stringtable parameters(const std::string &name) const; void update(const std::string &partName, const std::string ¶meters); void save(); }; std::string Schema::PartName = "SCHEMA"; class startsWith { public: startsWith(const std::string &in) : str(in) {} bool operator() ( const std::string &in ) const { return (in.find(str) == 0); } private: const std::string &str; }; void Schema::update(const std::string &partName, const std::string &inparameters) { VectorLines::iterator aEnd = std::remove_if(_lines.begin(), _lines.end(), startsWith(partName)); if (aEnd != _lines.end()) _lines.erase(aEnd, _lines.end()); _lines.push_back(partName + " " + inparameters); } Stringtable Schema::parameters(const std::string &name) const { Stringtable result; VectorLines::const_iterator aEnd = _lines.end(); for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter) { if (aIter->find(name) == 0) { boost::char_separator sep(" ="); boost::tokenizer< boost::char_separator > tokens(name, sep); boost::tokenizer< boost::char_separator >::const_iterator it = tokens.begin(); ++it; // skip name while(it != tokens.end()) { const std::string &part1 = *it; ++it; if (it == tokens.end()) break; const std::string &part2 = *it; result[part1] = part2; ++it; } break; } } return result; } Schema::Schema(const IndexAccessor &index, bool inupdate) : IndexAccessor(index), _update(inupdate) { read(); } #ifdef UNX #define MAX_LINE PATH_MAX #else #define 
MAX_LINE _MAX_PATH #endif void Schema::read() { std::ifstream* in = getSchemaLineInput(); char line[MAX_LINE]; // This needs to be replaced with our XML Parser while (in->getline(line, MAX_LINE)) _lines.push_back(line); delete in; } void Schema::save() { if (_update) { std::fstream* out = getOutputStream(PartName); *out << "JavaSearch 1.0\n"; VectorLines::const_iterator aEnd = _lines.end(); for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter) *out << *aIter << '\n'; delete out; } } class DBPartParameters { Schema &_schema; std::string _partName; Stringtable _parameters; protected: bool parametersKnown() const; void updateSchema(const std::string ¶meters) { _schema.update(_partName, parameters); } public: DBPartParameters(Schema &schema, const std::string &partName); int integerParameter(const std::string &name); }; DBPartParameters::DBPartParameters(Schema &schema, const std::string &partName) : _schema(schema), _partName(partName) { _parameters = schema.parameters(partName); } bool DBPartParameters::parametersKnown() const { return !_parameters.empty(); } int DBPartParameters::integerParameter(const std::string &name) { std::istringstream converter(_parameters[name]); int ret; converter >> ret; return ret; } class BlockManagerParameters : public DBPartParameters { private: fs::path _file; int _blockSize; protected: int _root; public: BlockManagerParameters(Schema &schema, const std::string &partName); bool readState(); const fs::path& getFile() const { return _file; } int getBlockSize() const { return _blockSize; } void setBlockSize(int size) { _blockSize = size; } int getRootPosition() const { return _root; } void setRoot(int root) { _root = root; } void updateSchema(const std::string ¶ms); }; void BlockManagerParameters::updateSchema(const std::string ¶ms) { std::ostringstream tmp; tmp << "bs=" << _blockSize << " rt=" << _root << " fl=-1 " << params; DBPartParameters::updateSchema(tmp.str()); } 
BlockManagerParameters::BlockManagerParameters(Schema &schema, const std::string &partName) : DBPartParameters(schema, partName), _root(0) { _file = schema.indexFile(partName); HCDBG(std::cerr << "file name set to " << _file.native_file_string()); readState(); } bool BlockManagerParameters::readState() { if (parametersKnown()) { _blockSize = integerParameter("bs"); _root = integerParameter("rt"); return true; } else return false; } class BtreeDictParameters : public BlockManagerParameters { private: int _id1; public: BtreeDictParameters(Schema &schema, const std::string &partName); int getFreeID() const { return _id1; } void setFreeID(int id) { _id1 = id; } void updateSchema(); }; void BtreeDictParameters::updateSchema() { std::ostringstream tmp; tmp << "id1=" << _id1 << " id2=1"; BlockManagerParameters::updateSchema(tmp.str()); } BtreeDictParameters::BtreeDictParameters(Schema &schema, const std::string &partName) : BlockManagerParameters(schema, partName) { } int readInt(std::fstream &in) { HCDBG(std::cerr << "want to read at " << in.tellg() << std::endl); int ret = 0; for (int i = 3; i >= 0; --i) { unsigned char byte; in.read( (char*)&byte, 1 ); ret |= (static_cast(byte) << (i*8)); HCDBG(fprintf(stderr, "inputting %x ret is now %x\n", byte, ret)); } return ret; } void writeByte(std::fstream &out, unsigned char byte) { out.write( (const char *)&byte, 1 ); } void writeShort(std::fstream &out, int item) { for (int i = 1; i >= 0; --i) { unsigned char byte = static_cast((item >> (i*8))); out.write( (const char*)&byte, 1 ); } } void writeInt(std::fstream &out, int item) { HCDBG(std::cerr << "want to write at " << out.tellp() << std::endl); for (int i = 3; i >= 0; --i) { unsigned char byte = static_cast((item >> (i*8))); HCDBG(fprintf(stderr, "outputting %x in is %x\n", byte, item)); out.write( (const char*)&byte, 1 ); } } void readFully(std::fstream &in, std::vector &_data) { in.read((char*)(&_data[0]), _data.size()); } /** Base class for (typically btree) blocks to 
hold either byte vectors representing graph/tree edges, or pairs (key, id) for dictionaries Each block has a header and a data section */ class Block { public: static int HEADERLEN; // length of Block ID in bytes static int IDLEN; // number of the block // used for both referring to the block // and addresssing the block in file unsigned int _number; bool _isLeaf; // first available byte in data section int _free; std::vector _data; Block(int blocksize) : _number(0), _isLeaf(true), _free(0) { _data.resize(blocksize - HEADERLEN); } virtual ~Block() {} void setBlockNumber(int n) { _number = n; } virtual void setFree(int free) { _free = free; } // interpret 4 bytes at 'i' as an integer int integerAt(int i) const { int result = ((((((_data[i]&0xFF)<<8) |_data[i+1]&0xFF)<<8) |_data[i+2]&0xFF)<<8) |_data[i+3]&0xFF; return result; } void setIntegerAt(int i, int value) { /* for (int j = i + 3; j >= i; j--, value >>= 8) _data[j] = (unsigned char)(value & 0xFF); */ _data[i++] = (unsigned char)((value >> 24) & 0xFF); _data[i++] = (unsigned char)((value >> 16) & 0xFF); _data[i++] = (unsigned char)((value >> 8) & 0xFF); _data[i] = (unsigned char)(value & 0xFF); } void readIn(std::fstream &in) { _number = readInt(in); int twoFields = readInt(in); _isLeaf = (twoFields & 0x80000000) != 0; HCDBG(std::cerr << "read leaf as " << _isLeaf << std::endl); _free = twoFields & 0x7FFFFFFF; readFully(in, _data); } void writeOut(std::fstream &out) const { writeInt(out, _number); writeInt(out, _free | (_isLeaf ? 
0x80000000 : 0)); out.write((const char*)(&_data[0]), _data.size()); } }; int Block::HEADERLEN = 8; // length of Block ID in bytes int Block::IDLEN = 4; class BtreeDict; class EntryProcessor; typedef std::vector IntegerArray; class DictBlock : public Block { public: DictBlock(); int free() const { return _free + firstEntry(); } int numberOfEntries() const { return integerAt(0); } int nthPointer(int n) const { return integerAt(4*(n + 1)); } int getChildIdx(int index) const; int entryKeyLength(int i) const { return _data[i] & 0xFF; } int entryCompression(int i) const { return _data[i + 1] & 0xFF; } int entryID(int i) const { return integerAt(i + 2); } int entryLength(int entry) const; int entryKey(int entry) const; int firstEntry() const { return 4; } int nextEntry(int entry) const { return entry + entryLength(entry); } void restoreKeyInBuffer(int entry, std::vector &buffer); std::string restoreKey(int entry, std::vector &buffer); std::string findID(int id) throw( HelpProcessingException ); void setBlockNumbers(std::vector &blocks) const; void listBlock(); void doMap(BtreeDict &owner, const EntryProcessor &processor); void withPrefix(BtreeDict &owner, const std::string &prefix, size_t prefLen, IntegerArray &result); }; class BlockFactory; class BlockProcessor; class BlockDescriptor { public: Block *_block; bool _modf; BlockDescriptor(Block *block) : _block(block), _modf(false) {} }; // end of BlockDescriptor class BlockManager { private: static int INCR; std::fstream _file; long _blockSize; bool _update; BlockFactory *_blockFactory; std::vector _blockTab; public: BlockManager(const BlockManagerParameters *params, bool update, BlockFactory *bfactory) throw( HelpProcessingException ); ~BlockManager(); Block& accessBlock(int blockNumber); void setModified(int blNum); void close(); Block& getNewBlock(); void processBlocks(BlockProcessor &processor); void mapBlock(Block* block); void addDescriptor(Block* block) throw( HelpProcessingException ); private: void 
writeBlock(const Block &bl); }; int BlockManager::INCR = 64; // size increment class EntryProcessor { public: virtual void processEntry(const std::string &string, int id) const = 0; virtual ~EntryProcessor() {}; }; class BtreeDict { public: static int ENTHEADERLEN; static int BLOCKSIZE; static int DATALEN; static int MaxKeyLength; static int lastPtrIndex; protected: BlockManager *blockManager; int root; std::vector blocks; BtreeDict() {/*empty*/} ~BtreeDict() { delete blockManager; } BtreeDict(const BtreeDictParameters *params); void init(const BtreeDictParameters *params, bool update, BlockFactory *bfactory); public: int fetch(const std::string &key); void close(); private: std::string fetch(int conceptID); IntegerArray withPrefix(const std::string &prefix); public: DictBlock& accessBlock(int index); DictBlock& child(const DictBlock &bl, int index) throw( HelpProcessingException ); private: std::string findID(int blNum, int id); int find(const DictBlock &bl, std::vector &key, int index); int find(const DictBlock &bl, std::vector &key); void setBlocks(std::vector &blocks); void map(const EntryProcessor &processor); public: void dumpnode(DictBlock &bl, int level); }; class BlockFactory { public: virtual Block* makeBlock() const = 0; virtual ~BlockFactory() {} }; static int dictcount; class DictBlockFactory : public BlockFactory { public: Block* makeBlock() const { dictcount++; return new DictBlock; } }; BtreeDict::BtreeDict(const BtreeDictParameters *params) { init(params, false, new DictBlockFactory()); blocks.resize(params->getFreeID()); setBlocks(blocks); } void BtreeDict::dumpnode(DictBlock &bl, int level) { if (!bl._isLeaf) { fprintf(stderr, "\n"); for (int i = 0; i < level; ++i) fprintf(stderr, "\t"); fprintf(stderr, "there are %d entries\n", bl.numberOfEntries()); for (int i = 0; i < level; ++i) fprintf(stderr, "\t"); for (int i = 0; i < bl.numberOfEntries(); ++i) { int index = bl.getChildIdx(i); fprintf(stderr, " %d ", index); DictBlock &thischild = 
accessBlock(index); dumpnode(thischild, level + 1); } fprintf(stderr, "\n"); } } int BtreeDict::fetch(const std::string &key) { HCDBG(std::cerr << "fetching " << key << " from root " << root << std::endl); DictBlock &bl = accessBlock(root); int length = key.size(); std::vector Key(length + 1); memcpy(&(Key[0]), key.c_str(), length); Key[length] = 0; // sentinel return find(bl, Key); } std::string BtreeDict::fetch(int conceptID) { return findID(blocks[conceptID], conceptID); } IntegerArray BtreeDict::withPrefix(const std::string &prefix) { IntegerArray result; accessBlock(root).withPrefix(*this, prefix, prefix.size(), result); return result; } void BtreeDict::close() { blockManager->close(); } void BtreeDict::init(const BtreeDictParameters *params, bool update, BlockFactory *bfactory) { blockManager = new BlockManager(params, update, bfactory); root = params->getRootPosition(); } DictBlock& BtreeDict::accessBlock(int index) { return (DictBlock&)blockManager->accessBlock(index); } DictBlock& BtreeDict::child(const DictBlock &bl, int index) throw( HelpProcessingException ) { if (bl._isLeaf) { std::stringstream aStrStream; aStrStream << "leaf's can't have children, screwed!" << std::endl; throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); } return accessBlock(bl.getChildIdx(index)); } std::string BtreeDict::findID(int blNum, int id) { return accessBlock(blNum).findID(id); } int BtreeDict::find(const DictBlock &bl, std::vector &key, int index) { HCDBG(std::cerr << "find2: " << bl._isLeaf << " : " << index << " : " << std::endl); return bl._isLeaf ? 
0 : find(child(bl, index), key); } int BtreeDict::find(const DictBlock &bl, std::vector &key) { int inputKeyLen = key.size() - 1; int entryPtr = bl.firstEntry(); int freeSpace = bl.free(); int nCharsEqual = 0; int compression = 0; HCDBG(std::cerr << "find1: " << inputKeyLen << " : " << entryPtr << " : " << freeSpace << " : " << nCharsEqual << " " << compression << std::endl); for (int entryIdx = 0;;) { if (entryPtr == freeSpace) return find(bl, key, bl.numberOfEntries()); else if (compression == nCharsEqual) { int keyLen = bl.entryKeyLength(entryPtr); int keyPtr = bl.entryKey(entryPtr), i; for (i = 0; i < keyLen && key[nCharsEqual] == bl._data[keyPtr + i]; i++) ++nCharsEqual; if (i == keyLen) { if (nCharsEqual == inputKeyLen) return bl.entryID(entryPtr); } else if ((key[nCharsEqual]&0xFF) < (bl._data[keyPtr + i]&0xFF)) return find(bl, key, entryIdx); } else if (compression < nCharsEqual) // compression dropped return find(bl, key, entryPtr == freeSpace ? bl.numberOfEntries() : entryIdx); do { entryPtr = bl.nextEntry(entryPtr); ++entryIdx; } while (bl.entryCompression(entryPtr) > nCharsEqual); compression = bl.entryCompression(entryPtr); } } class BlockProcessor { protected: std::vector &blocks; public: BlockProcessor(std::vector &_blocks) : blocks(_blocks) {} virtual void process(const Block &block) = 0; virtual ~BlockProcessor() {} }; class DictBlockProcessor : public BlockProcessor { public: DictBlockProcessor(std::vector &_blocks) : BlockProcessor(_blocks) {} void process(const Block &block) { ((const DictBlock&)block).setBlockNumbers(blocks); } }; BlockManager::BlockManager(const BlockManagerParameters *params, bool update, BlockFactory *bfactory) throw( HelpProcessingException ) : _blockFactory(bfactory) { _update = update; // params.readState(); _blockSize = params->getBlockSize(); HCDBG(std::cerr << "opening " << params->getFile().native_file_string() << std::endl); if (!update) { _file.open(params->getFile().native_file_string().c_str(), std::ios::in | 
std::ios::binary); } else { _file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary); if (!_file.is_open()) { HCDBG(std::cerr << "didn't exist" << std::endl); _file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary); } if (!_file.is_open()) { std::stringstream aStrStream; aStrStream << "Cannot open " << params->getFile().native_file_string() << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } } _file.seekg(0, std::ios::end); long length = _file.tellg(); if (length < 0) length = 0; _file.seekg(0, std::ios::beg); _file.clear(); HCDBG(std::cerr << "len is " << length << std::endl); if (length <= 0 && update) { Block* _dummy = bfactory->makeBlock(); _dummy->setBlockNumber(0); writeBlock(*_dummy); delete _dummy; length = _blockSize; } _file.seekg(0, std::ios::beg); int _blockTableSize = (length/_blockSize); HCDBG(std::cerr << "len is now " << _blockTableSize << std::endl); for (int i = 0; i < _blockTableSize; ++i) mapBlock(bfactory->makeBlock()); } Block& BlockManager::getNewBlock() { unsigned int number = _blockTab.size(); Block *bl = _blockFactory->makeBlock(); bl->setBlockNumber(number); writeBlock(*bl); addDescriptor(bl); return *(_blockTab[number]._block); } void BlockManager::setModified(int blNum) { _blockTab[blNum]._modf = true; } void BlockManager::close() { if (_update) { std::vector::const_iterator aEnd = _blockTab.end(); for (std::vector::const_iterator aIter = _blockTab.begin(); aIter != aEnd; ++aIter) { if (aIter->_modf) writeBlock(*(aIter->_block)); } } _file.close(); } void BlockManager::processBlocks(BlockProcessor &processor) { std::vector::const_iterator aEnd = _blockTab.end(); for (std::vector::const_iterator aIter = _blockTab.begin(); aIter != aEnd; ++aIter) { processor.process(*(aIter->_block)); } } void BlockManager::mapBlock(Block* block) { block->readIn(_file); addDescriptor(block); 
} void BlockManager::addDescriptor(Block *block) throw( HelpProcessingException ) { BlockDescriptor desc(block); _blockTab.push_back(desc); HCDBG(std::cerr << "numbers are " << block->_number << " " << (_blockTab.size()-1) << std::endl); if (block->_number != _blockTab.size() - 1) { std::stringstream aStrStream; aStrStream << "totally screwed" << std::endl; throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); } HCDBG(std::cerr << "addDescriptor blocks are now " << _blockTab.size() << std::endl); } void BlockManager::writeBlock(const Block &bl) { _file.seekp(_blockSize * bl._number); bl.writeOut(_file); } Block& BlockManager::accessBlock(int blockNumber) { return *(_blockTab[blockNumber]._block); } BlockManager::~BlockManager() { std::vector::iterator aEnd = _blockTab.end(); for (std::vector::iterator aIter = _blockTab.begin(); aIter != aEnd; ++aIter) { delete aIter->_block; } delete _blockFactory; } void BtreeDict::setBlocks(std::vector &inblocks) { DictBlockProcessor foo(inblocks); blockManager->processBlocks(foo); } // can go to Full void BtreeDict::map(const EntryProcessor &processor) { accessBlock(root).doMap(*this, processor); } void DictBlock::restoreKeyInBuffer(int entry, std::vector &buffer) { int howMany = entryKeyLength(entry); int where = entryCompression(entry); int from = entryKey(entry); while (howMany-- > 0) buffer[where++] = _data[from++]; } std::string DictBlock::restoreKey(int entry, std::vector &buffer) { int howMany = entryKeyLength(entry); int where = entryCompression(entry); int from = entryKey(entry); while (howMany-- > 0) buffer[where++] = _data[from++]; return std::string((const char*)(&buffer[0]), 0, where); } std::string DictBlock::findID(int id) throw( HelpProcessingException ) { std::vector buffer(BtreeDict::MaxKeyLength); int freeSpace = free(); for (int ent = firstEntry(); ent < freeSpace; ent = nextEntry(ent)) { if (entryID(ent) == id) // found return restoreKey(ent, buffer); else restoreKeyInBuffer(ent, 
buffer); } std::stringstream aStrStream; aStrStream << "ID not found in block" << std::endl; throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); } void DictBlock::setBlockNumbers(std::vector &blocks) const { for (int e = firstEntry(); e < _free; e = nextEntry(e)) blocks[entryID(e)] = _number; } void DictBlock::listBlock() { std::vector buffer(BtreeDict::MaxKeyLength); int freeSpace = free(); int entryPtr = firstEntry(); if (_isLeaf) { while (entryPtr < freeSpace) { std::cout << restoreKey(entryPtr, buffer) << " " << entryID(entryPtr); entryPtr = nextEntry(entryPtr); } } else std::cout << "not leaf" << std::endl; } void DictBlock::doMap(BtreeDict &owner, const EntryProcessor &processor) { std::vector buffer(BtreeDict::MaxKeyLength); int freeSpace = free(); int entryPtr = firstEntry(); if (_isLeaf) { while (entryPtr < freeSpace) { processor.processEntry(restoreKey(entryPtr, buffer), entryID(entryPtr)); entryPtr = nextEntry(entryPtr); } } else { int entryIdx = 0; while (entryPtr < freeSpace) { owner.accessBlock(getChildIdx(entryIdx)).doMap(owner,processor); processor.processEntry(restoreKey(entryPtr, buffer), entryID(entryPtr)); entryPtr = nextEntry(entryPtr); ++entryIdx; } owner.accessBlock(getChildIdx(entryIdx)).doMap(owner, processor); } } void DictBlock::withPrefix(BtreeDict &owner, const std::string &prefix, size_t prefLen, IntegerArray &result) { std::vector buffer(BtreeDict::MaxKeyLength); int freeSpace = free(); int entryPtr = firstEntry(); if (_isLeaf) { while (entryPtr < freeSpace) { if (restoreKey(entryPtr, buffer).find(prefix) == 0) result.push_back(entryID(entryPtr)); entryPtr = nextEntry(entryPtr); } } else { int entryIndex = 0; while (entryPtr < freeSpace) { std::string key = restoreKey(entryPtr, buffer); if (key.size() > prefLen) key = key.substr(0, prefLen); int cmp = key.compare(prefix); if (cmp < 0) { entryPtr = nextEntry(entryPtr); ++entryIndex; } else if (cmp == 0) { result.push_back(entryID(entryPtr)); 
owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result); entryPtr = nextEntry(entryPtr); ++entryIndex; } else { owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result); return; } } owner.accessBlock(getChildIdx(numberOfEntries())).withPrefix(owner, prefix, prefLen, result); } } int BtreeDict::ENTHEADERLEN = 6; int BtreeDict::BLOCKSIZE = 2048; int BtreeDict::DATALEN = BtreeDict::BLOCKSIZE - Block::HEADERLEN; int BtreeDict::MaxKeyLength = 255; //!!! Careful with that number, Eugene int BtreeDict::lastPtrIndex = 508; DictBlock::DictBlock() : Block(BtreeDict::BLOCKSIZE) { } int DictBlock::getChildIdx(int index) const { return nthPointer(BtreeDict::lastPtrIndex - index); } int DictBlock::entryLength(int entry) const { return BtreeDict::ENTHEADERLEN + entryKeyLength(entry); } int DictBlock::entryKey(int entry) const { return entry + BtreeDict::ENTHEADERLEN; } void setBlockNumber2(std::vector &blocks, size_t index, int number) { if (index >= blocks.size()) blocks.resize(index + 1000); blocks[index] = number; } class Entry { public: std::vector key; int id; int block; Entry(const std::vector &keyin, int length, int idin) : key(length+1), id(idin), block(-1) { memcpy(&key[0], &keyin[0], length); } Entry(const std::string &keyin, int idin) : key(keyin.size()+1), id(idin), block(-1) { memcpy(&key[0], keyin.c_str(), keyin.size()); } bool smallerThan(const Entry &other) { for (size_t i = 0; i < std::min(key.size(), other.key.size()); i++) if (key[i] != other.key[i]) return (key[i]&0xFF) < (other.key[i]&0xFF); return false; } }; // end of internal class Entry class FullDictBlock; class FullBtreeDict : public BtreeDict { protected: BtreeDictParameters *_params; bool update; public: FullBtreeDict(BtreeDictParameters ¶ms, bool update); void store(const std::string &bla, int id) throw( HelpProcessingException ); boost::shared_ptr insert(FullDictBlock &bl, boost::shared_ptr ent); boost::shared_ptr insertHere(FullDictBlock 
&bl, boost::shared_ptr ent) throw( HelpProcessingException ); FullDictBlock& getNewBlock(); void setModified(Block &bl); void close(int freeID); }; class FullDictBlock : public DictBlock { public: virtual void setFree(int free); void setNumberOfEntries(int n) { setIntegerAt(0, n); } void setChildIndex(int index, int value) { setIntegerAt(4*(BtreeDict::lastPtrIndex - index + 1), value); } void setEntryID(int i, int id) { setIntegerAt(i + 2, id); } void setBlockNumbers(std::vector &blocks) const; bool insert(const Entry &entry); void makeEntry(int entry, const std::vector &key, int id, int length, int compr); bool insert(const Entry &ent, int entryPtr, int compr1, int compr2, int index); int insertInternal(const Entry &entry); boost::shared_ptr split(FullDictBlock &newbl); void initInternal(int leftBlock, const Entry &entry); bool insert(boost::shared_ptr entry); bool insert(boost::shared_ptr ent, int entryPtr, int compr1, int compr2, int index); }; void FullDictBlock::initInternal(int leftBlock, const Entry &entry) { _isLeaf = false; setNumberOfEntries(1); setChildIndex(0, leftBlock); setChildIndex(1, entry.block); int ent = firstEntry(); makeEntry(ent, entry.key, entry.id, entry.key.size() - 1, 0); setFree(nextEntry(ent)); } void FullDictBlock::setFree(int infree) { _free = infree - firstEntry(); _data[infree] = _data[infree + 1] = 0; // sentinel } boost::shared_ptr FullDictBlock::split(FullDictBlock& newbl) { std::vector buffer(BtreeDict::MaxKeyLength); int freeSpace = free(); int half = freeSpace/2; int index = 0; // of middle entry newbl._isLeaf = _isLeaf; int ent; for (ent = firstEntry(); ent < half; ent = nextEntry(ent)) { restoreKeyInBuffer(ent, buffer); ++index; } int entriesToMove = numberOfEntries() - index - 1; // middle entry restoreKeyInBuffer(ent, buffer); int len = entryKeyLength(ent) + entryCompression(ent); boost::shared_ptr result(new Entry(buffer, len, entryID(ent))); result->block = newbl._number; int newFree = ent; // rest goes to the new block 
ent = nextEntry(ent); restoreKeyInBuffer(ent, buffer); len = entryKeyLength(ent) + entryCompression(ent); int nptr = firstEntry(); newbl.makeEntry(nptr, buffer, entryID(ent), len, 0); ent = nextEntry(ent); memmove(&(newbl._data[newbl.nextEntry(nptr)]), &(_data[ent]), freeSpace - ent); newbl.setNumberOfEntries(entriesToMove); newbl.setFree(newbl.nextEntry(nptr) + freeSpace - ent); if (_isLeaf == false) // need to split pointers { int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1); int to = from + 4*(index + 1); memmove(&(newbl._data[to]), &(_data[from]), 4*(entriesToMove + 1)); } // this entry will end here setFree(newFree); setNumberOfEntries(index); return result; //!!!remember updating ID -> string association } void FullDictBlock::setBlockNumbers(std::vector &blocks) const { for (int e = firstEntry(); e < _free; e = nextEntry(e)) setBlockNumber2(blocks, entryID(e), _number); } bool FullDictBlock::insert(boost::shared_ptr ent, int entryPtr, int compr1, int compr2, int index) { const std::vector &key = ent->key; int keyLen = key.size() - 1 - compr1; int freeSpace = free(); // calculate how much space is needed to add the new entry // first, how many bytes are needed for just the new entry int demand = BtreeDict::ENTHEADERLEN + keyLen; // adding an entry can increase compression in the following entry int increase = 0; if (entryPtr < freeSpace) if (entryCompression(entryPtr) < compr2) increase = compr2 - entryCompression(entryPtr); /* std::cerr << "key " << key << std::endl; std::cerr << "entryPtr " << entryPtr << std::endl; std::cerr << "compr1 " << compr1) << std::endl; std::cerr << "compr2 " << compr2) << std::endl; std::cerr << "index " << index) << std::endl; std::cerr << "demand " << demand) << std::endl; std::cerr << "increase " << increase) << std::endl; */ // check if enough space is available int limit = _isLeaf ? 
// NOTE(review): this copy of the file is extraction-damaged — all template
// arguments (e.g. boost::shared_ptr<Entry>, std::vector<unsigned char>) and
// some '&' entities were stripped; tokens below are reproduced as found.
// Tail of the ?: choosing the space limit: leaf blocks may use the whole data
// area (minus 2 sentinel bytes), internal blocks must leave room for child ptrs.
            BtreeDict::DATALEN-2 : 4*(BtreeDict::lastPtrIndex-numberOfEntries()-1);
    if (freeSpace + demand - increase <= limit) // 2 for sentinel
    {
        if (entryPtr < freeSpace)
        {
            // need to shift extant entries forward
            int toMove = increase > 0 ?
                entryPtr + BtreeDict::ENTHEADERLEN + increase : entryPtr;
            // move entries
            memmove(&(_data[toMove + demand - increase]), &(_data[toMove]),
                freeSpace - toMove);
            if (increase > 0)
            {
                // update header: shorten the following entry's stored key and
                // raise its front-compression count by the extra shared prefix
                unsigned char tmp = static_cast(increase);
                _data[entryPtr] = _data[entryPtr] - tmp;
                _data[entryPtr + 1] = _data[entryPtr + 1] + tmp;
                // shift header
                memmove(&(_data[entryPtr + demand]), &(_data[entryPtr]),
                    BtreeDict::ENTHEADERLEN);
            }
        }
        // now write the new entry in the space made above
        makeEntry(entryPtr, key, ent->id, keyLen, compr1);
        if (_isLeaf == false)
        {
            // internal node: open a slot in the child-pointer table (stored as
            // 4-byte ids growing down from lastPtrIndex) and link the new child
            int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1);
            memmove(&(_data[from - 4]), &(_data[from]),
                4*(numberOfEntries() - index));
            setChildIndex(index + 1, ent->block);
        }
        setFree(freeSpace + demand - increase);
        setNumberOfEntries(numberOfEntries() + 1);
        /* Java-era debug dump, retained verbatim from the original port:
           System.err.println("------------list--------------");
           byte[] buffer = new byte[MaxKeyLength];
           final int freeSpace2 = free();
           int entryPtr2 = firstEntry();
           while (entryPtr2 < freeSpace2) {
               System.err.println(entryPtr2);
               System.err.println(entryKeyLength(entryPtr2));
               System.err.println(entryCompression(entryPtr2));
               System.err.println(new String(_data, entryKey(entryPtr2), entryKeyLength(entryPtr2)));
               System.err.println(restoreKey(entryPtr2, buffer)+" "+ entryID(entryPtr2));
               entryPtr2 = nextEntry(entryPtr2);
           }
           System.err.println("------------end--------------"); */
        return true;
    }
    else
        return false; // not enough room — caller must split the block
}

// finds the place and context
// Walks the front-compressed entry list to locate the insertion point for
// 'entry'; delegates the actual write to the positional insert() overload.
// Returns true when stored in this block, false when the block must split.
bool FullDictBlock::insert(boost::shared_ptr entry)
{
    const std::vector &inkey = entry->key;
    int inputKeyLen = inkey.size() - 1; // key is NUL/sentinel-terminated
    int freeSpace = free();
    int entryPtr = firstEntry();
    int nCharsEqual = 0;   // chars of 'inkey' matched so far
    int prevNCEqual = 0;
    int compression = 0;   // current entry's front-compression count
    for (int entryIndex = 0;;)
    {
        if (entryPtr == freeSpace)
            // ran off the end: append after the last entry
            return insert(entry, entryPtr, nCharsEqual, 0, numberOfEntries());
        else if (compression == nCharsEqual)
        {
            int keyLen = entryKeyLength(entryPtr);
            int keyPtr = entryKey(entryPtr), i;
            prevNCEqual = nCharsEqual;
            for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++)
                ++nCharsEqual;
            if (i == keyLen)
            {
                if (nCharsEqual == inputKeyLen)
                {
                    // exact key already present — just overwrite its id
                    HCDBG(std::cerr << "setting to " << entry->id << std::endl);
                    setEntryID(entryPtr, entry->id);
                    return true;
                }
            }
            else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF))
                // new key sorts before the current entry
                return insert(entry, entryPtr, prevNCEqual, nCharsEqual, entryIndex);
        }
        else if (compression < nCharsEqual) // compression dropped
        {
            int index = entryPtr == freeSpace ? numberOfEntries() : entryIndex;
            return insert(entry, entryPtr, nCharsEqual, compression, index);
        }
        // skip entries compressed deeper than our current match depth
        do
        {
            entryPtr = nextEntry(entryPtr);
            ++entryIndex;
        }
        while (entryCompression(entryPtr) > nCharsEqual);
        compression = entryCompression(entryPtr);
    }
}

// debug counter of FullDictBlock allocations
static int fulldictcount;

// Factory handed to the BlockManager so it materializes FullDictBlocks.
class FullDictBlockFactory : public BlockFactory
{
public:
    Block* makeBlock() const
    {
        fulldictcount++;
        return new FullDictBlock;
    }
};

// Visitor that records each block's on-disk number into the shared table.
class FullDictBlockProcessor : public BlockProcessor
{
public:
    FullDictBlockProcessor(std::vector &_blocks) : BlockProcessor(_blocks) {}
    void process(const Block &block)
    {
        ((const FullDictBlock&)block).setBlockNumbers(blocks);
    }
};

// Writable b-tree dictionary mapping tokens to integer ids.
// NOTE(review): '¶ms' below is a mangled '&params' (HTML entity damage).
FullBtreeDict::FullBtreeDict(BtreeDictParameters ¶ms, bool _update)
    : _params(¶ms), update(_update)
{
    init(_params, update, new FullDictBlockFactory());
    HCDBG(std::cerr << "id is " << params.getFreeID() << std::endl);
    blocks.resize(params.getFreeID());
    FullDictBlockProcessor foo(blocks);
    blockManager->processBlocks(foo);
    /* if (logging) log = new FileWriter("/tmp/FullBtreeDict.log"); */
}

// Marks a block dirty so the BlockManager flushes it on close.
void FullBtreeDict::setModified(Block &bl)
{
    blockManager->setModified(bl._number);
}

// Allocates a fresh block and immediately marks it dirty.
FullDictBlock& FullBtreeDict::getNewBlock()
{
    FullDictBlock &nbl = (FullDictBlock&)blockManager->getNewBlock();
    setModified(nbl);
    return nbl;
}

// Inserts 'ent' into block 'bl'; on overflow splits the block and returns the
// median entry to be pushed up into the parent (empty ptr when no split).
boost::shared_ptr
FullBtreeDict::insertHere(FullDictBlock &bl, boost::shared_ptr ent)
    throw( HelpProcessingException )
{
    setModified(bl); // to be modified in any case
    if (bl.insert(ent))
        return boost::shared_ptr();
    else
    {
        FullDictBlock &nbl = getNewBlock();
        boost::shared_ptr middle = bl.split(nbl);
        nbl.setBlockNumbers(blocks);
        // retry in whichever half the entry now belongs to
        if ((middle->smallerThan(*ent) ? nbl : bl).insert(ent) == false)
        {
            std::stringstream aStrStream;
            aStrStream << "entry didn't fit into a freshly split block" << std::endl;
            throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
        }
        return middle;
    }
}

// Writes one entry header (key length, compression count), its id, and the
// uncompressed key suffix at offset 'entry'.
void FullDictBlock::makeEntry(int entry, const std::vector &key, int id, int length, int compr)
{
    _data[entry] = static_cast(length);
    _data[entry + 1] = static_cast(compr);
    setEntryID(entry, id);
    memmove(&(_data[entryKey(entry)]), &(key[compr]), length);
}

// Locates which child subtree of an internal block should receive 'entry'.
// Returns the child index, or -1 when the key already exists here (id updated).
int FullDictBlock::insertInternal(const Entry &entry)
{
    const std::vector &inkey = entry.key;
    int inputKeyLen = inkey.size() - 1;
    int entryPtr = firstEntry();
    int freeSpace = free();
    int nCharsEqual = 0;
    int compression = 0;
    for (int entryIndex = 0;;)
    {
        if (entryPtr == freeSpace)
            return numberOfEntries(); // rightmost child
        else if (compression == nCharsEqual)
        {
            int i;
            int keyLen = entryKeyLength(entryPtr);
            int keyPtr = entryKey(entryPtr);
            for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++)
                ++nCharsEqual;
            if (i == keyLen)
            {
                if (nCharsEqual == inputKeyLen)
                {
                    setEntryID(entryPtr, entry.id);
                    return -1; // key already present
                }
            }
            else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF))
                return entryIndex;
        }
        else if (compression < nCharsEqual) // compression dropped
            return entryPtr >= freeSpace ? numberOfEntries() : entryIndex;
        do
        {
            entryPtr = nextEntry(entryPtr);
            ++entryIndex;
        }
        while (entryCompression(entryPtr) > nCharsEqual);
        compression = entryCompression(entryPtr);
    }
}

/* delegation to powerful primitives at the FullDictBlock level lets us
   express the insertion algorithm very succintly here */
// Recursive b-tree insert; a non-empty return is the median entry that must
// be inserted into the caller's level (propagating splits toward the root).
boost::shared_ptr FullBtreeDict::insert(FullDictBlock &bl, boost::shared_ptr ent)
{
    if (bl._isLeaf)
        ent = insertHere(bl, ent);
    else
    {
        int index = bl.insertInternal(*ent);
        if (index != -1)
        {
            ent = insert((FullDictBlock&)child(bl, index), ent);
            if (ent.get())
                ent = insertHere(bl, ent);
        }
    }
    return ent;
}

// Public entry point: store token 'key' with integer id 'id'.
// Throws when the token exceeds the 250-byte key limit; grows a new root
// when a split propagates all the way up.
void FullBtreeDict::store(const std::string &key, int id)
    throw( HelpProcessingException )
{
    HCDBG(std::cerr << "so storing " << key << " id " << id << std::endl);
    if (key.size() >= 250)
    {
        std::stringstream aStrStream;
        aStrStream << "token " << key << " too long" << std::endl;
        throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
    }
    boost::shared_ptr aTemp(new Entry(key, id));
    FullDictBlock &rBlock = (FullDictBlock&)accessBlock(root);
    boost::shared_ptr entry = insert(rBlock, aTemp);
    if (entry.get())
    {
        // new root; writing to params needed
        FullDictBlock &nbl = getNewBlock();
        nbl.initInternal(root, *entry);
        setBlockNumber2(blocks, entry->id, root = nbl._number);
        _params->setRoot(root);
    }
}

// Persists the next-free-id watermark (and schema when updating) then closes
// the underlying block store.
void FullBtreeDict::close(int freeID)
{
    _params->setFreeID(freeID);
    if (update)
        _params->updateSchema();
    BtreeDict::close();
    /* if (logging) log.close(); */
}

// Value type: one occurrence of a concept (word id) as a [begin, end) word
// range within a document.
class ConceptLocation
{
public:
    int _concept;
    int _begin;
    int _end;
public:
    ConceptLocation(int conceptID, int begin, int end);
    static void sortByConcept(std::vector &array, int i1, int i2);
    static void sortByPosition(std::vector &array, int i1, int i2);
    int getConcept() const { return _concept; }
    void setConcept(int concept) { _concept = concept; }
    int getBegin() const { return _begin; }
    int getEnd() const { return _end; }
    int getLength() const { return _end - _begin; }
    // field-wise equality
    bool equals(const ConceptLocation &other) const
    {
        return _concept==other._concept&&_begin==other._begin&&_end==other._end;
    }
};

class DocumentCompressor;

// In-memory model of one search index: token dictionary, per-document
// micro-index offsets, positions and titles tables.
class Index : public IndexAccessor
{
protected:
    // token -> id map; template args lost in this copy (string -> int)
    typedef std::hash_map IndexHashtable;
    bool _update;              // true when building/updating, false read-only
    IndexHashtable _cache;
    Schema *_schema;
private:
    BtreeDictParameters *_dictParams;
    FullBtreeDict *_dict;
    int _freeID;               // next unassigned token id
    std::fstream *_positionsFile;
    std::fstream *_offsetsFile;
    DocumentCompressor *_documentCompressor;
    IntegerArray _concepts;
    IntegerArray _offsets;
    std::vector _allLists; // POSITIONS
    void readDocumentsTable(const std::string &fileName);
    void readOffsetsTables(const std::string &fileName);
    void readPositions();
protected:
    IntegerArray _microIndexOffsets;
    IntegerArray _documents;
    IntegerArray _titles;
    std::vector _positions;
private:
    int _positionsCacheSize;
    int _currentBatchOffset;
    bool _allInCache;
protected:
    virtual void writeOutOffsets();
public:
    Index(const fs::path &indexName, bool update);
    virtual ~Index();
    void init();
    int intern(const std::string &name);
    std::fstream& getPositionsFile();
    std::fstream& getOffsetsFile();
    DocumentCompressor& getDocumentCompressor();
    virtual void compress(int docID, int titleID, std::vector &locations, std::vector &extents);
    void close();
};

// Members start empty/null; real setup happens lazily in init().
Index::Index(const fs::path &indexName, bool update)
    : IndexAccessor(indexName), _update(update), _cache(256), _schema(NULL),
      _dictParams(NULL), _dict(NULL), _positionsFile(0), _offsetsFile(0),
      _documentCompressor(0), _positionsCacheSize(0), _currentBatchOffset(0),
      _allInCache(false)
{
}

class CompressorIterator;

// Bit-level reader for the gamma-like variable-length encoding produced by
// Compressor; subclasses supply bytes via getNextByte().
class Decompressor
{
private:
    static int BitsInByte;
    static int NBits;
    int _readByte;  // current byte being consumed
    int _toRead;    // bits still unread in _readByte
    int _path;      // high-bits state shared across readNext() calls
protected:
    virtual int getNextByte() = 0;
    virtual void initReading()
    {
        _toRead = 0;
        _path = 0;
    }
private:
    int countZeroes();
    // reads 1 bit; returns non-0 for bit "1"
    int read();
public:
    int read(int kBits);
    void beginIteration() { _path = 0; }
    bool readNext(int k, CompressorIterator &it);
    void decode(int k, IntegerArray
&array);
    void ascDecode(int k, IntegerArray &array);
    int ascendingDecode(int k, int start, std::vector &array);
    virtual ~Decompressor() {}
};

int Decompressor::BitsInByte = 8;
int Decompressor::NBits = 32;

// Decompressor reading from an in-memory byte vector.
class ByteArrayDecompressor : public Decompressor
{
private:
    const std::vector *_array;
    int _index;   // current read position
    int _index0;  // start position, for bytesRead()
public:
    ByteArrayDecompressor(const std::vector *array, int index)
    {
        initReading(array, index);
    }
    using Decompressor::initReading;
    // Re-points the reader at array[index]; array may be NULL until then.
    virtual void initReading(const std::vector *array, int index)
    {
        _array = array;
        _index = _index0 = index;
        Decompressor::initReading();
    }
    // number of bytes consumed since the last initReading()
    int bytesRead()
    {
        return _index - _index0;
    }
protected:
    int getNextByte()
    {
        int ret = (*_array)[_index] & 0xFF;
        HCDBG(fprintf(stderr, "ByteArrayDecompressor::getNextByte of %d at index %d\n", ret, _index));
        _index++;
        return ret;
    }
};

bool isExtensionMode( void );

class IndexInverter;

// Cursor over one document's compressed concept list ("micro index"),
// decoded group by group; used as a heap element during index inversion.
class MicroIndex
{
public:
    static int RANGE;      // width of one concept range bucket
    static int NConcepts;  // initial capacity of _concepts
private:
    int _currentRange;     // _concepts[_ix] / RANGE — heap ordering key
    int _documentNumber;
    std::vector _concepts; // decoded concept ids of the current group
    short _group;          // next group to decode
    short _ix;             // cursor into _concepts
    IntegerArray _kTable;
    IntegerArray _offsets;
    IntegerArray _maxConcepts;
    const std::vector *_data; // shared POSITIONS byte array
    int _base;             // offset of this document's data in *_data
    int _limit;            // last group number
    int _nc;               // count of valid entries in _concepts
    ByteArrayDecompressor _decmp;
public:
    MicroIndex(int documentNumber, const std::vector *positions, int index);
    // heap order: by current concept range, ties broken by document number
    bool smallerThan(const MicroIndex &other)
    {
        return _currentRange < other._currentRange ||
            _currentRange == other._currentRange && _documentNumber < other._documentNumber;
    }
private:
    // Decodes the next concept group; returns false when exhausted.
    bool next()
    {
        if (_group <= _limit)
        {
            int shift, index;
            if (_group > 0)
            {
                index = _base + _offsets[_group - 1];
                shift = _maxConcepts[_group - 1];
            }
            else
            {
                index = _base;
                shift = 0;
            }
            _decmp.initReading(_data, index);
            _nc = _decmp.ascendingDecode(_kTable[_group*2], shift, _concepts);
            HCDBG(std::cerr << "nc b set to " << _nc << std::endl);
            if (_group < _limit)
            {
                // append the group's max concept as its closing element
                HCDBG(fprintf(stderr, "microindex concept index %d set to %d\n", _nc, _maxConcepts[_group]));
                _concepts[_nc++] = _maxConcepts[_group];
            }
            _currentRange = _concepts[_ix = 0]/RANGE;
            _group++;
            return true;
        }
        else
            return false;
    }
    // Parses the header byte and primes the first group. Type is in the top
    // two bits; extent-bearing formats (1, 3) are not implemented here.
    void openDocumentIndex()
    {
        unsigned int kk = (*_data)[_base] & 0xFF;
        HCDBG(std::cerr << "openDocumentIndex, kk is " << kk << " base is " << _base << std::endl);
        switch (kk >> 6) // get type
        {
            case 0: // single group, no extents
                _decmp.initReading(_data, _base += 2);
                _nc = _decmp.ascendingDecode(kk & 0x3F, 0, _concepts);
                HCDBG(std::cerr << "nc a set to " << _nc << std::endl);
                _currentRange = _concepts[_ix = 0]/RANGE;
                _limit = 0;
                _group = 1;
                break;
            case 2: // multi group, no extents
            {
                _decmp.initReading(_data, _base + 1);
                _decmp.decode(kk & 0x3F, _kTable);
                int last = _kTable.back();
                _kTable.pop_back();
                _decmp.ascDecode(last, _offsets);
                last = _kTable.back();
                _kTable.pop_back();
                _decmp.ascDecode(last, _maxConcepts);
                _base += 1 + _decmp.bytesRead();
                _limit = _maxConcepts.size();
                _group = 0;
                next();
            }
            break;
            case 1: // single group, extents
            case 3: // multi group, extents
                if( !isExtensionMode() )
                    std::cerr << "extents not yet implemented" << std::endl;
                break;
        }
    }
public:
    bool process(IndexInverter &lists);
};

int MicroIndex::RANGE = 1024;
int MicroIndex::NConcepts = 16;

// Append-only bit stream buffered in 32-bit words; backing store for the
// Compressor's variable-length codes.
class BitBuffer
{
private:
    static int InitSize;
    static int NBits;       // bits per word (32)
    static int BitsInByte;
    static int BytesInInt;
    int _avail;             // free bits left in _word
    unsigned int _word;     // word currently being filled
    int _free;              // first unused slot in _array
    int _size;
    std::vector _array;
public:
    BitBuffer() : _avail(NBits), _word(0), _free(0), _size(InitSize)
    {
        _array.resize(InitSize);
    }
    // Flushes the partially filled word (left-aligned) into the array.
    void close()
    {
        if (_avail < NBits)
            store(_word << _avail);
        else
            _avail = 0;
    }
    // Writes full words, then only the used bytes of the last word.
    void write(std::fstream &out) const
    {
        for (int i = 0; i < _free - 1; i++)
            writeInt(out, _array[i]);
        unsigned int word = _array[_free - 1];
        int bytes = BytesInInt - _avail/BitsInByte;
        int shift = NBits;
        while (bytes-- > 0)
            writeByte(out, static_cast((word >> (shift -= BitsInByte)) & 0xFF));
    }
    void clear()
    {
        _word = 0;
        _avail = NBits;
        _free = 0;
    }
    int byteCount() { return _free*BytesInInt - _avail/BitsInByte; }
    int bitCount() { return _free*NBits - _avail; }
    // Deep-copies another buffer's state.
    void setFrom(const BitBuffer &rhs)
    {
        _word = rhs._word;
        _avail = rhs._avail;
        if ((_free = rhs._free) > _size)
            _array.resize(_size = rhs._free);
        _array = rhs._array;
    }
private:
    void growArray(int newSize)
    {
        _array.resize(newSize);
        _size = newSize;
    }
    void store(unsigned int value)
    {
        if (_free == _size)
            growArray(_size * 2);
        HCDBG(fprintf(stderr, "store of %x to %d\n", (int)value, _free));
        _array[_free++] = value;
    }
public:
    // Appends a single bit.
    void append(int bit)
    {
        _word = (_word << 1) | bit;
        if (--_avail == 0)
        {
            store(_word);
            _word = 0;
            _avail = NBits;
        }
    }
    // Appends the low kBits of 'source', splitting across words as needed.
    void append(unsigned int source, int kBits)
    {
        if (kBits < _avail)
        {
            _word = (_word << kBits) | source;
            _avail -= kBits;
        }
        else if (kBits > _avail)
        {
            int leftover = kBits - _avail;
            store((_word << _avail) | (source >> leftover));
            _word = source;
            _avail = NBits - leftover;
        }
        else
        {
            store((_word << kBits) | source);
            _word = 0;
            _avail = NBits;
        }
    }
    // Appends the whole contents of another (closed) buffer, re-aligning
    // its words when this buffer's last word is partially filled.
    void concatenate(const BitBuffer &bb)
    {
        if (_size - _free < bb._free)
            growArray(_free + bb._free + 1);
        if (_avail == 0)
        {
            memmove(&_array[_free], &bb._array[0], bb._free * sizeof(unsigned int));
            _avail = bb._avail;
            _free += bb._free;
            HCDBG(fprintf(stderr, "free bumped to %d\n", _free));
        }
        else
        {
            int tp = _free - 1; // target
            int sp = 0; // source
            do
            {
                _array[tp] |= bb._array[sp] >> (NBits - _avail);
                _array[++tp] = bb._array[sp++] << _avail;
            }
            while (sp < bb._free);
            _free += bb._free;
            if ((_avail += bb._avail) >= NBits)
            {
                _avail -= NBits;
                _free--;
            }
            HCDBG(fprintf(stderr, "other free bumped to %d\n", _free));
        }
    }
};

// Encodes integer arrays into a BitBuffer with a tunable parameter k.
class Compressor
{
private:
    static int NBits;
    static int BeginK;
    BitBuffer _buffer;
public:
    void write(std::fstream &out) const { _buffer.write(out); }
    int byteCount() { return _buffer.byteCount(); }
    void clear() { _buffer.clear(); }
    void concatenate(const Compressor &other) { _buffer.concatenate(other._buffer); }
    void encode(const IntegerArray &pos, int k);
    void encode(const IntegerArray &pos, const IntegerArray &len, int k, int k2);
    // k: starting value for minimization
    int minimize(const IntegerArray &array, int startK);
    int compressAscending(const IntegerArray &array);
};

// out[i] = in[i] - in[i-1]; out[0] = in[0]. Guards empty input.
void toDifferences(const IntegerArray &in, IntegerArray &out)
{
    if (out.size() < in.size())
        out.resize(in.size());
    if (in.empty())
        return;
    out[0] = in[0];
    for (size_t i = 1; i < in.size(); ++i)
        out[i] = in[i] - in[i - 1];
}

// Merges all per-document micro-indexes (via a min-heap of MicroIndex
// cursors) into per-concept posting lists, written to DOCS / DOCS.TAB.
class IndexInverter
{
private:
    static int K;  // starting k for compressor minimization
    std::vector _arrays;    // posting list per concept in the current range
    int _minConcept;        // first concept of the current range window
    int _limit;             // first concept past the current range window
    IntegerArray _concepts;
    IntegerArray _offsets;
    Compressor _compr;
    IntegerArray _diffs;
    std::fstream *_mainFile;
    // heap
    int _heapSize;
    std::vector _heap;
    Index &_index;
public:
    IndexInverter(Index &index)
        : _arrays(MicroIndex::RANGE), _minConcept(0), _limit(MicroIndex::RANGE),
          _mainFile(0), _heapSize(0), _index(index)
    {}
    ~IndexInverter()
    {
        delete _mainFile;
        for (int i = 0; i < _heapSize; i++)
        {
            HCDBG(fprintf(stderr, "deleting number %d\n", i));
            delete _heap[i];
        }
    }
    // Drives the whole inversion: builds the heap, drains it range by range,
    // then writes the compressed concept/offset tables.
    // NOTE(review): 'µIndexOffsets' is a mangled '&microIndexOffsets'
    // (HTML entity damage); the loop body uses the unmangled name.
    void invertIndex(int nDocuments, const IntegerArray µIndexOffsets)
    {
        _mainFile = _index.getOutputStream("DOCS");
        for (int i = 0; i < MicroIndex::RANGE; i++)
            _arrays[i] = IntegerArray();
        // read in the whole POSITIONS file
        std::vector positions = _index.readByteArray("POSITIONS");
        // build heap
        _heap.clear();
        _heap.resize(_heapSize = nDocuments);
        for (int i = 0; i < nDocuments; i++)
            _heap[i] = new MicroIndex(i, &positions, microIndexOffsets[i]);
        for (int i = _heapSize/2; i >= 0; i--)
            heapify(i);
        // process till exhausted
        while (!_heap.empty())
            if (_heap[0]->process(*this))
                heapify(0);
            else if (_heapSize > 1)
            {
                // document exhausted: replace heap top with the last element
                delete _heap[0];
                _heap[0] = _heap[--_heapSize];
                heapify(0);
            }
            else
                break;
        // closing
        flush();
        _mainFile->close();
        // compress index file
        std::fstream *indexFile = _index.getOutputStream("DOCS.TAB");
        unsigned char byte = static_cast( _compr.compressAscending(_concepts));
        indexFile->write( (const char*)&byte, 1 ); // write k
        _compr.write(*indexFile);
        _compr.clear();
        byte = static_cast(_compr.minimize(_offsets, K));
        indexFile->write( (const char*)&byte, 1 ); // write k
        _compr.write(*indexFile);
        indexFile->close();
        delete indexFile;
    }
    // Consumes concepts below the current range limit from one document,
    // appending the document number to each concept's posting list.
    // Returns the cursor position where it stopped.
    short process(int documentNumber, std::vector &concepts, int n, short start, bool firstTime)
    {
        if (firstTime && concepts[start] >= _limit)
            flush();
        concepts[n] = _limit; // sentinel
        while (concepts[start] < _limit)
        {
            _arrays[concepts[start++] - _minConcept].push_back(documentNumber);
        }
        return start;
    }
private:
    // Standard sift-down on the MicroIndex min-heap.
    void heapify(int i)
    {
        int r = (i + 1) << 1, l = r - 1;
        int smallest = l < _heapSize && _heap[l]->smallerThan(*_heap[i]) ? l : i;
        if (r < _heapSize && _heap[r]->smallerThan(*_heap[smallest]))
            smallest = r;
        if (smallest != i)
        {
            MicroIndex *temp = _heap[smallest];
            _heap[smallest] = _heap[i];
            _heap[i] = temp;
            heapify(smallest);
        }
    }
    // Writes out every non-empty posting list of the finished range
    // (delta-encoded + compressed) and advances the range window.
    void flush()
    {
        for (int i = 0; i < MicroIndex::RANGE; ++i)
        {
            if (!_arrays[i].empty())
            {
                toDifferences(_arrays[i], _diffs);
                unsigned char byte = static_cast( _compr.minimize(_diffs, K));
                _mainFile->write( (const char*)&byte, 1 ); // write k
                _offsets.push_back(_compr.byteCount() + 1);
                _compr.write(*_mainFile);
                _concepts.push_back(_minConcept + i);
                _arrays[i].clear();
                _diffs.clear();
                _compr.clear();
            }
        }
        _limit += MicroIndex::RANGE;
        _minConcept += MicroIndex::RANGE;
    }
};

int IndexInverter::K = 3;

// Positions the cursor at this document's data and decodes the first group.
MicroIndex::MicroIndex(int documentNumber, const std::vector *positions, int index)
    : _concepts(NConcepts + 1), _data(positions), _decmp(NULL, 0)
{
    _documentNumber = documentNumber;
    _base = index;
    openDocumentIndex();
}

// Feeds concepts to the inverter until the range limit or exhaustion.
// Returns true while more data remains for this document.
bool MicroIndex::process(IndexInverter &lists)
{
    bool firstTime = true;
    while (true)
    {
        short stop = lists.process(_documentNumber, _concepts, _nc, _ix, firstTime);
        if (stop < _nc)
        {
            _currentRange = _concepts[_ix = stop]/RANGE;
            return true;
        }
        else if (next())
            firstTime = false;
        else
            return false;
    }
}

// Flushes and tears down the index: closes the dictionary, writes offsets
// and schema, and (when updating) runs the inversion pass.
void Index::close()
{
    /* Java leftovers kept from the original port:
       BtreeDictCompactor source = new BtreeDictCompactor(_dictParams, false);
       URL url = new URL("file", "", _indexDir + "compacted");
       BtreeDictParameters params = new BtreeDictParameters(url, _dictParams.getBlockSize(), 0, _freeID);
       source.compact(params);
       URL tmapURL = new URL("file", "", _indexDir +
"DICTIONARY"); File tmap = new File(tmapURL.getFile()); File compacted = new File(url.getFile()); compacted.renameTo(tmap); _dictParams.setRoot(params.getRootPosition()); _dictParams.updateSchema(); */
    _dict->close(_freeID);
    if (_positionsFile)
    {
        delete _positionsFile;
        _positionsFile = NULL;
    }
    if (_update)
    {
        writeOutOffsets();
        _dictParams->setFreeID(_freeID);
        _dictParams->updateSchema();
        _schema->save();
        IndexInverter inverter(*this);
        inverter.invertIndex(_documents.size(), _microIndexOffsets);
    }
    if (_offsetsFile)
    {
        delete _offsetsFile;
        _offsetsFile = NULL;
    }
}

// (Re)initializes the index: creates/opens schema and dictionary, then either
// loads the existing index parts or starts empty tables.
void Index::init()
{
    bool indexExists = false;
    if (_update)
    {
        createIfNeeded();
        _cache.clear();
    }
    if (_schema)
        delete _schema;
    _schema = new Schema(*this, _update);
    if (_dictParams)
        delete _dictParams;
    _dictParams = new BtreeDictParameters(*_schema, "DICTIONARY");
    if (_dictParams->readState() == false)
    {
        // no saved state: fresh dictionary with default parameters
        _dictParams->setBlockSize(2048);
        _dictParams->setRoot(0);
        _dictParams->setFreeID(1);
    }
    else
        indexExists = true;
    if (_dict)
        delete _dict;
    _dict = new FullBtreeDict(*_dictParams, _update);
    _freeID = _dictParams->getFreeID();
    _documents.clear();
    if (indexExists)
    {
        // read in index parts
        _allLists = readByteArray("DOCS");
        readDocumentsTable("DOCS.TAB");
        readOffsetsTables("OFFSETS");
        readPositions();
    }
    else
    {
        _microIndexOffsets.clear();
        _titles.clear();
    }
}

namespace
{
    // Clips a token to below the 250-byte dictionary key limit.
    // NOTE(review): substr(--length) keeps the TAIL of the token and the loop
    // shortens one byte at a time — looks intentional but verify against the
    // dictionary's key-length check in FullBtreeDict::store.
    std::string cliptoken(const std::string &name)
    {
        std::string key = name;
        int length = key.size();
        while(key.size() >= 250)
            key = name.substr(--length);
        return key;
    }
}

// Returns the integer id for a token, assigning and storing a fresh id on
// first sight.
int Index::intern(const std::string &name)
{
    std::string key = cliptoken(name);
    IndexHashtable::const_iterator aIter = _cache.find(key);
    if (aIter != _cache.end())
        return aIter->second;
    else
    {
        //Seeing as we always start off with an empty dictionary,
        //our entries will always be in the _cache, so don't ever
        //search the underlying dictionary
        int id = _freeID++;
        _dict->store(key, id);
        _cache.insert(IndexHashtable::value_type(key, id)).first->second = id;
        return id;
    }
}

// Lazily opens the POSITIONS stream.
std::fstream& Index::getPositionsFile()
{
    if (!_positionsFile)
        _positionsFile = getRAF("POSITIONS", _update);
    return *_positionsFile;
}

// Lazily opens the OFFSETS stream.
std::fstream& Index::getOffsetsFile()
{
    if (!_offsetsFile)
        _offsetsFile = getRAF("OFFSETS", _update);
    return *_offsetsFile;
}

// Block-manager parameters extended with the fixed vector length ("vl").
class VectorBtreeParameters : public BlockManagerParameters
{
private:
    int _vectorLength;
public:
    // reads "vl" from the stored schema
    VectorBtreeParameters(Schema &schema, const std::string &partName)
        : BlockManagerParameters(schema, partName)
    {
        _vectorLength = integerParameter("vl");
    }
    void updateSchema()
    {
        std::ostringstream tmp;
        tmp << "vl=" << _vectorLength;
        BlockManagerParameters::updateSchema(tmp.str());
    }
    // explicit vector length (new part)
    VectorBtreeParameters(Schema &schema, const std::string &partName, int vecLen)
        : BlockManagerParameters(schema, partName)
    {
        _vectorLength = vecLen;
    }
    int getVectorLength() { return _vectorLength; }
};

// poor-man's labelled break/continue for the nested loops below
enum outerbreak { dobreak, docontinue, donothing };

// Callback interface for iterating the vectors stored in a leaf block.
class VectorProcessor
{
    std::vector _vector;
public:
    virtual bool processVector() = 0; // return true to stop iteration
    std::vector& getVectorBuffer() { return _vector; }
    virtual ~VectorProcessor() {}
};

class VectorBlock;

// B-tree over fixed-length byte vectors (front-compressed in leaves).
class VectorBtree
{
protected:
    VectorBlock *_root;
    BlockManager *_blockManager;
    VectorBtreeParameters *_params;
    int _blockSize;
public:
    int _maxEntries;     // entries per internal block (kept odd)
    int _leafDataLimit;  // max payload bytes in a leaf
protected:
    int _vectorsOffset;
    VectorBlock& accessBlock(int index);
    VectorBtree() {/*empty*/}
public:
    int _vecLen;         // fixed vector length
    int vector(int index) const;
    static int memcmp(const std::vector &v1, const std::vector &v2, int i2, int n);
    VectorBtree(VectorBtreeParameters *params);
    ~VectorBtree() { delete _blockManager; }
};

class VectorBlockFactory : public BlockFactory
{
private:
    int _blockSize;
public:
    VectorBlockFactory(int blockSize) : _blockSize(blockSize) {}
    Block* makeBlock() const;
};

// Opens an existing vector b-tree (read-only block manager).
VectorBtree::VectorBtree(VectorBtreeParameters *params)
{
    _params = params;
    _vecLen = params->getVectorLength();
    _blockSize = params->getBlockSize();
    _maxEntries=(_blockSize-Block::HEADERLEN-Block::IDLEN)/(_vecLen+Block::IDLEN);
    if ((_maxEntries & 1) == 0) // needs to be odd
        _maxEntries--;
    _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN;
    _vectorsOffset = (_maxEntries + 1)*Block::IDLEN;
    _blockManager = new BlockManager(_params, false, new VectorBlockFactory(_blockSize));
    _root = &(accessBlock(params->getRootPosition()));
}

VectorBlock& VectorBtree::accessBlock(int index)
{
    return (VectorBlock&)_blockManager->accessBlock(index);
}

// Unsigned byte-wise comparison of v1[0..n) against v2[i2..i2+n).
int VectorBtree::memcmp(const std::vector &v1, const std::vector &v2, int i2, int n)
{
    for (int i = 0; i < n; i++, i2++)
        if (v1[i] != v2[i2])
            return (v1[i]&0xFF) - (v2[i2]&0xFF);
    return 0;
}

// One block of the vector b-tree; leaves hold front-compressed vectors.
class VectorBlock : public Block
{
public:
    VectorBlock(int size) : Block(size) {}
protected:
    // Binary search in an internal block; an exact hit is returned encoded
    // as a negative value (-1 - k), a miss as the child slot index.
    int findIndex(const std::vector &key, const VectorBtree &tree)
    {
        int i = 0, j = _free - 1;
        while (i <= j)
        {
            int k = (i + j)/2;
            int test = VectorBtree::memcmp(key, _data, tree.vector(k),tree._vecLen);
            // std::cerr << "k = " << k << ", test = " << test << std::endl;
            if (test > 0)
                i = k + 1;
            else if (test < 0)
                j = k - 1;
            else
                return -1 - k; // result always negative; "k" encoded
        }
        return i;
    }
private:
    // Copies into 'buffer' the run of leaf vectors falling in [lo, hi]
    // (sharing at least commLen/prefLen prefix bytes); returns the copied
    // length + 1, or 0 when nothing in this leaf matches.
    int FindVectorsInLeaf(const std::vector &lo, const std::vector &hi, int commLen, int prefLen, std::vector &buffer, int size, const VectorBtree &tree)
    {
        int idx = 0, start;
        for (int nBytesEq = 0;;)
        {
            // std::cout << "idx = " << idx << std::endl;
            if (_data[idx] == nBytesEq) // at compression byte
            {
                int i;
                outerbreak hack(donothing);
                for (i = nBytesEq; i < tree._vecLen; i++)
                {
                    if (lo[i] == _data[++idx])
                        ++nBytesEq;
                    else if ((lo[i]&0xFF) < (_data[idx]&0xFF))
                        if (nBytesEq >= commLen && (i >= prefLen || (hi[i]&0xFF) >= (_data[idx]&0xFF)))
                        {
                            start = nBytesEq;
                            hack = dobreak;
                            break;
                        }
                        else
                            return 0;
                    else
                    {
                        idx += tree._vecLen - i; // skip
                        hack = docontinue;
                        break;
                    }
                }
                if (hack == dobreak)
                    break;
                else if (hack == docontinue)
                    continue;
                if (i == tree._vecLen) // eq vec found
                    if ((_data[++idx]&0xFF) >= prefLen)
                    {
                        start = _data[idx++]&0xFF;
                        break;
                    }
                    else
                        return 0;
            }
            else if (_data[idx] < nBytesEq) // drop
            {
                // NOTE(review): these std::cout lines look like leftover
                // debug output on a search path.
                std::cout << idx << std::endl;
                nBytesEq = (_data[idx++]);
                std::cout << nBytesEq << std::endl;
                if (nBytesEq < commLen)
                    return 0;
                else if (lo[nBytesEq] < (_data[idx]&0xFF))
                    if (hi[nBytesEq] < (_data[idx]&0xFF))
                        return 0;
                    else
                    {
                        start = nBytesEq; // found
                        break;
                    }
                else
                    idx += tree._vecLen - nBytesEq;
            }
            else if ((_data[idx]&0xFF) == 0xFF)
                return 0;
            else // compression is bigger
                idx += tree._vecLen + 1 - _data[idx];
        }
        int length = std::min(size - start, _free - idx);
        buffer[0] = static_cast(start);
        memcpy(&(buffer[1]), &(_data[idx]), length);
        buffer[length + 1] = 0;
        return length + 1;
    }
protected:
    // Exact-match lookup of 'key' in this leaf block.
    bool searchLeafBlock(const std::vector &key, const VectorBtree &tree)
    {
#if 0
        processLeafBlock(_printer);
#endif
        int nBytesEq = 0;
        for (int idx = 0;; idx += tree._vecLen + 1 - _data[idx])
        {
            if (_data[idx] == nBytesEq)
            {
                int i, j;
                outerbreak hack(donothing);
                for (i = _data[idx], j = idx + 1; i < tree._vecLen; i++, j++)
                {
                    if (key[i] == _data[j])
                        ++nBytesEq;
                    else if ((key[i]&0xFF) < (_data[j]&0xFF))
                        return false;
                    else /* key[i] > _data[j] */
                    {
                        hack = dobreak;
                        break;
                    }
                }
                if (hack == dobreak)
                    break;
                if (i == tree._vecLen) /* or nBytesEq == _vecLen */
                    return true; /* equal vector found */
            }
            else if (_data[idx] < nBytesEq)
                return false;
        }
        return false;
    }
public:
    // Reconstructs each stored vector into the processor's buffer and invokes
    // the callback; stops early when it returns true.
    bool processLeafBlock(VectorProcessor &processor, const VectorBtree &tree)
    {
        std::vector &buffer = processor.getVectorBuffer();
        for (int ix = 0; ix < _free; ix += tree._vecLen - _data[ix] + 1)
        {
            // cmc: the below line was a comment in the original java, somewhere along
            // the line I suspect this was written in c++, then into java
            // and now I'm putting it back to c++ :-(
            // ::memcpy(&buffer[_data[ix]], &_data[ix + 1], _vecLen - _data[ix]);
            memcpy(&(buffer[_data[ix]]), &(_data[ix + 1]), tree._vecLen - _data[ix]);
            if (processor.processVector())
                return true;
        }
        return false;
    }
}; // VectorBlock

Block* VectorBlockFactory::makeBlock() const
{
    return new VectorBlock(_blockSize);
}

// Writable vector block: adds the capacity check used by the insert path.
class FullVectorBlock : public VectorBlock
{
public:
    FullVectorBlock(int size) : VectorBlock(size) {}
    bool isFull(const VectorBtree &tree) const
    {
        //return pbl->_leaf ? pbl->_free > _leafDataLimit : pbl->_free == _maxEntries;
        return _isLeaf ? _free > tree._leafDataLimit : _free == tree._maxEntries;
    }
};

// Writable vector b-tree (insert support).
class FullVectorBtree : public VectorBtree
{
private:
    static int MaxVeclen;
    static double SplitRatio;
public:
    FullVectorBtree(VectorBtreeParameters* params, bool update);
    bool insertVector(const std::vector &key);
private:
    bool treeInsertNonfull(const FullVectorBlock &bl, const std::vector &key);
    bool treeInsertNonfullRoot(const std::vector &key);
    FullVectorBlock& getNewBlock();
    void enableModif(const Block &bl);
    void declareModif(const Block &bl);
public:
    void close() { _blockManager->close(); }
};

int FullVectorBtree::MaxVeclen = 128;
double FullVectorBtree::SplitRatio = 0.5;

class FullVectorBlockFactory : public BlockFactory
{
private:
    int _blockSize;
public:
    FullVectorBlockFactory(int blockSize) : _blockSize(blockSize) {}
    Block* makeBlock() const { return new FullVectorBlock(_blockSize); }
};

// Same geometry computation as VectorBtree's ctor, but with a writable
// block manager.
FullVectorBtree::FullVectorBtree(VectorBtreeParameters *params, bool update)
{
    _params = params;
    _vecLen = params->getVectorLength();
    _blockSize = params->getBlockSize();
    _blockManager = new BlockManager(params, update, new FullVectorBlockFactory(_blockSize));
    _maxEntries=(_blockSize-Block::HEADERLEN-Block::IDLEN)/(_vecLen+Block::IDLEN);
    // System.out.println("_maxEntries = " + _maxEntries);
    if ((_maxEntries & 1) == 0) // needs to be odd
        _maxEntries--;
    _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN;
    _vectorsOffset = (_maxEntries + 1)*Block::IDLEN;
    _root = &(accessBlock(params->getRootPosition()));
}

// Receives decoded values from Decompressor::readNext().
class CompressorIterator
{
public:
    virtual void value(int value) = 0;
    virtual ~CompressorIterator() {}
};

// Counts consecutive 0 bits up to (and not including) the next 1 bit,
// refilling _readByte from the stream as needed.
int Decompressor::countZeroes()
{
    for (int count = 0;; _readByte = getNextByte(), _toRead = BitsInByte)
    {
        HCDBG(fprintf(stderr, "count is %d\n", count));
        HCDBG(fprintf(stderr,
            "Decompressor::countZeroes is %x\n", _readByte));
        HCDBG(fprintf(stderr, "_toRead is %d\n", _toRead));
        HCDBG(fprintf(stderr, "_readByte is %x\n", _readByte));
        while (_toRead-- > 0)
        {
            if ((_readByte & (1 << _toRead)) != 0)
            {
                HCDBG(fprintf(stderr, "returning count of %d\n", count));
                return count;
            }
            else
            {
                ++count;
                HCDBG(fprintf(stderr, "int count to %d\n", count));
            }
        }
    }
    //return 0;
}

// reads 1 bit; returns non-0 for bit "1"
int Decompressor::read()
{
    if (_toRead-- > 0)
        return _readByte & (1 << _toRead);
    else
    {
        // get next word
        _toRead = BitsInByte - 1;
        return (_readByte = getNextByte()) & 0x80;
    }
}

// Reads the next kBits bits as an unsigned integer, spanning bytes.
// NOTE(review): '((_readByte<> (shift ...' in leg 1 is extraction damage —
// text between '<' and '>' was eaten; compare with a pristine copy.
int Decompressor::read(int kBits)
{
    int shift = BitsInByte - _toRead;
    if (kBits <= _toRead)
    {
        // leg 1: everything we need is in the current byte
        HCDBG(fprintf(stderr, "leg 1\n"));
        return ((_readByte<> (shift + (_toRead-=kBits));
    }
    else
    {
        // leg 2: consume the remainder of this byte, then whole bytes,
        // then the leading bits of one more byte
        HCDBG(fprintf(stderr, "leg 2 _readByte is %d, shift %d\n", _readByte, shift));
        int result = _toRead > 0 ? ((_readByte << shift) & 0xFF) >> shift : 0;
        HCDBG(fprintf(stderr, "result is %d\n", result));
        for (kBits -= _toRead; kBits >= BitsInByte; kBits -= BitsInByte)
        {
            int foo = getNextByte();
            HCDBG(fprintf(stderr, "byte is %d\n", foo));
            result = (result << BitsInByte) | foo;
            HCDBG(fprintf(stderr, "and result is %d\n", result));
        }
        if (kBits > 0)
        {
            int foo = getNextByte();
            HCDBG(fprintf(stderr, "and byte is %d\n", foo));
            int thing = BitsInByte - kBits;
            HCDBG(fprintf(stderr, "thing is %d\n", thing));
            _toRead = thing;
            _readByte = foo;
            int right = (_readByte >> _toRead);
            HCDBG(fprintf(stderr, "right is %d\n", right));
            int left = result << kBits;
            HCDBG(fprintf(stderr, "kbits are %d\n", kBits));
            HCDBG(fprintf(stderr, "left is %d\n", left));
            int ret = left | right;
            // int ret = (result << kBits) | ((_readByte = foo) >> (_toRead = BitsInByte - kBits));
            HCDBG(fprintf(stderr, "and final is %d\n", ret));
            return ret;
        }
        else
        {
            _toRead = 0;
            HCDBG(fprintf(stderr, "and this result says %d\n", result));
            return result;
        }
    }
}

// Decodes one value into 'it'; returns false at the end-of-stream marker
// (a path update that leaves _path unchanged).
bool Decompressor::readNext(int k, CompressorIterator &it)
{
    if (read() != 0)
    {
        it.value(_path | read(k));
        return true;
    }
    else
    {
        for (int count = 1;; _readByte = getNextByte(), _toRead = BitsInByte)
        {
            while (_toRead-- > 0)
            {
                if ((_readByte & (1 << _toRead)) != 0)
                {
                    int saved = _path;
                    _path = ((_path >> (k + count) << count) | read(count)) << k;
                    if (_path != saved)
                    {
                        it.value(_path | read(k));
                        return true;
                    }
                    else
                    {
                        return false;
                    }
                }
                else
                {
                    ++count;
                }
            }
        }
    }
}

// Decodes a full sequence of values (absolute form) into 'array'.
void Decompressor::decode(int k, IntegerArray &array)
{
    for (int path = 0;;)
    {
        if (read() != 0)
        {
            array.push_back(path | read(k));
        }
        else
        {
            int count = countZeroes() + 1;
            int saved = path;
            path = ((path >> (k + count) << count) | read(count)) << k;
            if (path != saved) // convention for end
                array.push_back(path | read(k));
            else
                break;
        }
    }
}

// Decodes a sequence of deltas, pushing running (ascending) sums.
void Decompressor::ascDecode(int k, IntegerArray &array)
{
    for (int path = 0, start = 0;;)
    {
        HCDBG(fprintf(stderr, "path is %d, start is %d\n", path, start));
        if (read() != 0)
        {
            int inread = read(k);
            start += path | inread;
            HCDBG(fprintf(stderr, "inread is %d\n", inread));
            int final = start;
            HCDBG(fprintf(stderr, "1:Decompressor::ascDecode to %d\n", final));
            array.push_back(final);
        }
        else
        {
            int count = countZeroes() + 1;
            HCDBG(fprintf(stderr, "count is %d\n", count));
            int saved = path;
            int inread = read(count);
            HCDBG(fprintf(stderr, "inread is %d, k is %d, path is %d\n", inread, k, path));
            path = ((path >> (k + count) << count) | inread) << k;
            if (path != saved) // convention for end
            {
                int anotherread = read(k);
                HCDBG(fprintf(stderr, "newinread is %d\n", anotherread));
                start += path | anotherread;
                int final = start;
                HCDBG(fprintf(stderr, "2:Decompressor::ascDecode to %d\n", final));
                array.push_back(final);
            }
            else
            {
                break;
            }
        }
    }
}

// Like ascDecode but writes into a preallocated vector starting from 'start';
// returns the number of values decoded.
int Decompressor::ascendingDecode(int k, int start, std::vector &array)
{
    int path = 0, index = 0;
    while (true)
    {
        if (read() != 0)
            array[index++] = (start += path | read(k));
        else
        {
            outerbreak hack = donothing;
            for (int cnt = 0;; _readByte = getNextByte(), _toRead = BitsInByte)
            {
                while (_toRead-- > 0)
                {
                    if ((_readByte & (1 << _toRead)) != 0)
                    {
                        ++cnt;
                        int Path = ((path >> (k + cnt) << cnt) | read(cnt)) << k;
                        if (Path != path)
                        {
                            array[index++] = (start += (path = Path) | read(k));
                            hack = docontinue;
                            break;
                        }
                        else
                            return index; // end marker
                    }
                    else
                        ++cnt;
                }
                if (hack == docontinue)
                    break;
            }
        }
    }
}

// Decompressor reading bytes from an input file stream.
class StreamDecompressor : public Decompressor
{
private:
    std::ifstream *_input;
public:
    StreamDecompressor(std::ifstream &input)
    {
        initReading(input);
    }
    using Decompressor::initReading;
    virtual void initReading(std::ifstream &input)
    {
        _input = &input;
        Decompressor::initReading();
    }
    int getNextByte()
    {
        unsigned char ret;
        _input->read( (char*)&ret, 1 );
        HCDBG(fprintf(stderr, "StreamDecompressor::getNextByte of %d\n", ret));
        return ret;
    }
};

// Loads the whole POSITIONS file into the in-memory cache.
void Index::readPositions()
{
    getPositionsFile();
    //!!! temporary: better than fixed large value, worse than 'intelligent' size mgt
    _positionsFile->seekg(0, std::ios::end);
    _positionsCacheSize = _positionsFile->tellg();
    if (_positionsCacheSize < 0)
        _positionsCacheSize = 0;
    _positionsFile->clear();
    _positionsFile->seekg(0, std::ios::beg);
    // NOTE(review): self-comparison is always true — this presumably meant to
    // compare the file size against a cache-limit constant; as written the
    // file is unconditionally cached. Confirm against the original source.
    if (_positionsCacheSize <= _positionsCacheSize)
    {
        _allInCache = true;
        _positions.resize(_positionsCacheSize);
        _positionsFile->readsome((char*)(&_positions[0]), _positionsCacheSize);
        std::cout << "POS fits in cache" << std::endl;
    }
}

// Reads OFFSETS: documents table, micro-index offsets, and title ids,
// each prefixed by its one-byte compression parameter k.
void Index::readOffsetsTables(const std::string &fileName)
{
    std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary);
    unsigned char k1;
    in.read( (char*)&k1, 1 );
    StreamDecompressor sddocs(in);
    sddocs.decode(k1, _documents);
    unsigned char k2;
    in.read( (char*)&k2, 1 );
    _microIndexOffsets.clear();
    StreamDecompressor sdoffsets(in);
    sdoffsets.ascDecode(k2, _microIndexOffsets);
    // decompress titles' ids table
    unsigned char k3;
    in.read( (char*)&k3, 1 );
    _titles.clear();
    StreamDecompressor sdtitles(in);
    sdtitles.decode(k3, _titles);
}

// Reads DOCS.TAB: the concepts table and their posting-list offsets
// (offsets list is seeded with 0 before decoding).
void Index::readDocumentsTable(const std::string &fileName)
{
    std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary);
    unsigned char k1;
    in.read( (char*)&k1, 1 );
    _concepts.clear();
    StreamDecompressor sddocs(in);
    sddocs.ascDecode(k1, _concepts);
    unsigned char k2;
    in.read( (char*)&k2, 1 );
    _offsets.clear();
    _offsets.push_back(0);
    StreamDecompressor sdoffsets(in);
    sdoffsets.ascDecode(k2, _offsets);
    in.close();
}

class ContextTables;

// Snapshot of one micro-index's context tables, cached so repeated
// setMicroindex() calls avoid re-decompressing.
class Tables
{
private:
    std::vector _initialWordsCached;
    std::vector _destsCached;
    std::vector _linkTypesCached;
    std::vector _seqNumbersCached;
public:
    Tables(const std::vector &initialWords, std::vector &dests, std::vector &linkTypes, std::vector &seqNumbers)
    {
        _initialWordsCached = initialWords;
        _destsCached = dests;
        _linkTypesCached = linkTypes;
        _seqNumbersCached = seqNumbers;
    }
    void setTables(ContextTables &context);
}; // end of Tables

// Per-document context (XML element nesting) tables used to map word
// positions back to their enclosing markup.
class ContextTables
{
public:
    std::vector _initialWords;
    std::vector _dests;
    std::vector _linkTypes;
    std::vector _seqNumbers;
    int _nTextNodes;
private:
    std::vector _cache; // cached last position for linear search
    int _initialWordsIndex;
    // link names are shared between all microindexes in an index
    std::vector _linkNames;
    // offsets to tables' storage in file (or memory)
    std::vector _offsets;
    std::vector _contextData; // !!! fully cached for now
    // auxillary
    IntegerArray _kTable;
    // _auxArray will be used as an auxillary to decode arrays
    IntegerArray _auxArray;
    int _lastDocNo; // last document selected via setMicroindex (-1 = none)
    std::vector _markers;
public:
    ContextTables(const std::vector &offsets, const std::vector &contextData, const std::vector &linkNames);
    ~ContextTables();
    void setMicroindex(int docNo);
    int parentContext(int context);
    const std::string& linkName(int context);
    int linkCode(const std::string &linkName);
    std::vector getIgnoredElementsSet(const std::vector &ignoredElements);
    bool notIgnored(int ctx, const std::vector &ignoredElements);
    int firstParentWithCode(int pos, int linkCode);
    int firstParentWithCode2(int pos, int linkCode, int parentCode);
    int firstParentWithCode3(int pos, int linkCode, int ancestorCode);
    int firstParentWithCode4(int pos, const std::vector &linkCodes);
    int firstParentWithCode5(int pos, const std::vector &pathCodes);
    int firstParentWithCode7(int pos, int linkCode, int seq);
    bool isGoverning(int context) { return linkName(context) == "TITLE"; }
    void resetContextSearch() { _initialWordsIndex = 0; }
private:
    void appendSegment(int context, std::string &result);
    int findIndexBin(int wordNumber);
public:
    int wordContextLin(int wordNumber);
};

// Copies the shared tables; _cache slots are lazily filled per document.
ContextTables::ContextTables(const std::vector &offsets, const std::vector &contextData, const std::vector &linkNames)
    : _kTable(5), _auxArray(4096), _lastDocNo(-1)
{
    _offsets = offsets;
    _contextData = contextData;
    _linkNames = linkNames;
    _cache.resize(_offsets.size());
}

ContextTables::~ContextTables()
{
    for (size_t i = 0; i < _cache.size(); ++i)
        delete _cache[i];
}

// Selects the context tables of document 'docNo', either from the cache or
// by decompressing them from _contextData. (Continues past this chunk.)
void ContextTables::setMicroindex(int docNo)
{
    if (docNo != _lastDocNo) // check if we need to do anything
    {
        if (_cache[docNo])
            _cache[docNo]->setTables(*this);
        else
        {
            int offset = _offsets[docNo];
            int k0 = _contextData[offset] & 0xFF;
            ByteArrayDecompressor compr(&_contextData, offset + 1);
            _kTable.clear();
            compr.decode(k0, _kTable);
            // decompress initialWords into auxiliary array
            _auxArray.clear();
compr.ascDecode(_kTable[0], _auxArray); // _initialWords _initialWords = _auxArray; _nTextNodes = _initialWords.size(); // decompress destinations into auxiliary array _auxArray.clear(); compr.decode(_kTable[1], _auxArray); // _dests _auxArray.push_back(-1); // sentinel, root _dests = _auxArray; _linkTypes.clear(); compr.decode(_kTable[2], _linkTypes); _seqNumbers.clear(); compr.decode(_kTable[3], _seqNumbers); _cache[docNo] = new Tables(_initialWords, _dests, _linkTypes, _seqNumbers); /* System.out.println("|_initialWords| = " + _nTextNodes); System.out.println("|_dests| -1 = " + (_dests.length - 1)); System.out.println("|_seqNumbers| = " + _seqNumbers.length); System.out.println("|_linkTypes| = " + _linkTypes.length); */ } _lastDocNo = docNo; _markers.resize(_dests.size()); } _initialWordsIndex = 0; } int ContextTables::parentContext(int context) { return _dests[context]; } const std::string& ContextTables::linkName(int context) { return _linkNames[_linkTypes[context]]; } int ContextTables::linkCode(const std::string &inlinkName) { for (size_t i = 0; i < _linkNames.size(); i++) if (inlinkName == _linkNames[i]) return i; return -1; // when not found } std::vector ContextTables::getIgnoredElementsSet(const std::vector &ignoredElements) { std::vector result; bool noValidIgnoredElements = true; if (!ignoredElements.empty()) { result.resize(_linkNames.size()); for (size_t i = 0; i < ignoredElements.size(); i++) { int code = linkCode(ignoredElements[i]); if (code > -1) { result[code] = true; noValidIgnoredElements = false; } } } return noValidIgnoredElements ? 
std::vector() : result;
}
// Returns false as soon as any ancestor of ctx (walking parent links in _dests
// up to the root sentinel -1) has a link type flagged in ignoredElements.
bool ContextTables::notIgnored(int ctx, const std::vector &ignoredElements)
{
    do
    {
        if (ignoredElements[_linkTypes[ctx]])
        {
            std::cout << "hit ignored" << std::endl;
            return false;
        }
    }
    while ((ctx = _dests[ctx]) > -1); // parentContext 'hand inlined'
    return true;
}
/** starting with ctx and going up the ancestry tree look for the
    first context with the given linkCode */
int ContextTables::firstParentWithCode(int pos, int inlinkCode)
{
    int ctx = _dests[wordContextLin(pos)]; // first parent of text node
    int shift = _nTextNodes;   // element contexts are numbered after the text nodes
    int limit = _dests.size() - 1; // index of the root sentinel
    while (_linkTypes[ctx - shift] != inlinkCode)
        if ((ctx = _dests[ctx]) == limit)
            return -1; // reached the root without a match
    return ctx;
}
/** starting with ctx and going up the ancestry tree look for the
    first context with the given linkCode and given parent code */
int ContextTables::firstParentWithCode2(int pos, int inlinkCode, int parentCode)
{
    int ctx = _dests[wordContextLin(pos)]; // first parent of text node
    int shift = _nTextNodes;
    int limit = _dests.size() - 1;
    // Slide a (ctx, parent) window up the ancestry until both codes match.
    for (int parent = _dests[ctx]; parent < limit; parent = _dests[parent])
        if (_linkTypes[parent - shift] == parentCode && _linkTypes[ctx - shift] == inlinkCode)
            return ctx;
        else
            ctx = parent;
    return -1;
}
/** starting with ctx and going up the ancestry tree look for the
    first context with the given linkCode and given ancestor code */
int ContextTables::firstParentWithCode3(int pos, int inlinkCode, int ancestorCode)
{
    int ctx = _dests[wordContextLin(pos)];
    int shift = _nTextNodes;
    int limit = _dests.size() - 1;
    // find first instance of linkCode
    while (ctx < limit && _linkTypes[ctx - shift] != inlinkCode)
        ctx = _dests[ctx];
    if (ctx < limit) // found linkCode, check ancestry
        for (int ancestor = _dests[ctx]; ancestor < limit; ancestor = _dests[ancestor])
            if (_linkTypes[ancestor - shift] == ancestorCode) // ancestor confirmed
                return ctx; // match found, return successful ctx
    return -1; // match NOT found
}
/** starting with ctx and going up the ancestry tree look
for the first context with any of the given linkCode */ int ContextTables::firstParentWithCode4(int pos, const std::vector &linkCodes) { int nCodes = linkCodes.size(); int shift = _nTextNodes; int limit = _dests.size() - 1; for (int ctx = _dests[wordContextLin(pos)]; ctx < limit; ctx = _dests[ctx]) { int code = _linkTypes[ctx - shift]; for (int i = 0; i < nCodes; i++) if (code == linkCodes[i]) return ctx; } return -1; } /** starting with ctx and going up the ancestry tree look for the first context with the given path */ int ContextTables::firstParentWithCode5(int pos, const std::vector &pathCodes) { int nCodes = pathCodes.size(); int lastCode = pathCodes[nCodes - 1]; int shift = _nTextNodes; int limit = _dests.size() - 1; int ctx = _dests[wordContextLin(pos)]; for (int parent = _dests[ctx]; parent < limit; parent = _dests[parent]) { if (_linkTypes[ctx - shift] == lastCode) { // try to match the entire path outerbreak hack = donothing; for (int i = nCodes - 2, parent2 = parent; i >= 0; i--) if (_linkTypes[parent2 - shift] != pathCodes[i]) // match failure { hack = docontinue; break; // try to match higher } else if ((parent2 = _dests[parent2]) == limit) return -1; if (hack == docontinue) continue; return ctx; } else ctx = parent; } return -1; } /** starting with ctx and going up the ancestry tree look for the first context with the given linkCode */ int ContextTables::firstParentWithCode7(int pos, int inlinkCode, int seq) { int ctx = _dests[wordContextLin(pos)]; // first parent of text node int shift = _nTextNodes; int limit = _dests.size() - 1; while (_linkTypes[ctx - shift] != inlinkCode || _seqNumbers[ctx] != seq) if ((ctx = _dests[ctx]) == limit) return -1; return ctx; } void ContextTables::appendSegment(int context, std::string &result) { result.append(context < _nTextNodes ? 
"text()" : _linkNames[_linkTypes[context - _nTextNodes]]);
    result.push_back('[');
    std::ostringstream tmp;
    tmp << _seqNumbers[context];
    result.append(tmp.str());
    result.append("]/");
}
// Binary search over _initialWords: returns the index of the text node whose
// first-word number is <= wordNumber (an exact hit returns that slot directly).
int ContextTables::findIndexBin(int wordNumber)
{
    int i = 0, j = _nTextNodes - 1;
    while (i <= j)
    {
        int k = (i + j) >> 1;
        if (_initialWords[k] < wordNumber)
            i = k + 1;
        else if (_initialWords[k] > wordNumber)
            j = k - 1;
        else
            return k;
    }
    return i - 1;
}
// Linear variant of the search above; resumes from the cached position
// (_initialWordsIndex) so monotonically increasing queries stay cheap.
int ContextTables::wordContextLin(int wordNumber)
{
    for (int i = _initialWordsIndex; i < _nTextNodes; i++)
        if (_initialWords[i] > wordNumber) // first such i
        { // - 1 if wordNumbers can be the same
            _initialWordsIndex = i; // cached to speed up next search
            return i - 1;
        }
    return _nTextNodes - 1;
}
// Restore a previously decompressed microindex's tables into the given context
// (counterpart of the cache fill in ContextTables::setMicroindex).
void Tables::setTables(ContextTables &context)
{
    context._initialWords = _initialWordsCached;
    context._dests = _destsCached;
    context._linkTypes = _linkTypesCached;
    context._seqNumbers = _seqNumbersCached;
    context._nTextNodes = context._initialWords.size();
}
class Compressor;
// Index specialisation that additionally maintains the EDGE btree and the
// per-document context tables (CONTEXTS / LINKNAMES data).
class XmlIndex : public Index
{
private:
    VectorBtreeParameters *_edgesParams;
    FullVectorBtree *_edges;
    ContextTables *_contextTables;
    // NOTE(review): _contextsFile is released only in close(); the destructor
    // below does not delete it — confirm close() is always called before
    // destruction, otherwise the stream object leaks.
    std::fstream *_contextsFile;
    IntegerArray _contextsOffsets;
    std::vector _contextsData;
    std::vector _linkNames;
protected:
    virtual void writeOutOffsets();
public:
    XmlIndex(const fs::path &index, bool update)
        : Index(index, update)
        , _edgesParams(0)
        , _edges(0)
        , _contextTables(0)
        , _contextsFile(0)
    {}
    void init();
    void close();
    virtual ~XmlIndex()
    {
        delete _edgesParams;
        delete _edges;
        delete _contextTables;
    }
    std::fstream& getContextsFile();
    using Index::compress;
    virtual void compress(int docID, int titleID, std::vector &locations, std::vector &extents, int k, const Compressor &contextTables);
    const std::vector& getLinkNames() { return _linkNames; }
};
// Open/refresh the EDGE btree and, when context offsets exist, rebuild the
// context tables from the CONTEXTS data (continues below).
void XmlIndex::init()
{
    Index::init();
    if (_edgesParams)
        delete _edgesParams;
    _edgesParams = new VectorBtreeParameters(*_schema, "EDGE", 9);
    if (_edgesParams->readState() == false)
_edgesParams->setBlockSize(1024);
    _edges = new FullVectorBtree(_edgesParams, _update);
    if (!_contextsOffsets.empty())
    {
        _contextsData = readByteArray("CONTEXTS");
#if 0
        _linkNames = (String[])readObject("LINKNAMES");
#endif
        _contextTables = new ContextTables(_contextsOffsets, _contextsData, _linkNames);
    }
}
// Append the compressed, ascending context-offsets table to the offsets file.
void XmlIndex::writeOutOffsets()
{
    Index::writeOutOffsets();
    if (!_contextsOffsets.empty())
    {
        std::fstream &out = getOffsetsFile();
        Compressor offsets2;
        char k = static_cast(offsets2.compressAscending(_contextsOffsets));
        out.write( (const char*)&k, 1 ); // one-byte 'k' parameter precedes the table
        offsets2.write(out);
    }
}
// Lazily open the CONTEXTS random-access file.
std::fstream& XmlIndex::getContextsFile()
{
    if (!_contextsFile)
        _contextsFile = getRAF("CONTEXTS", _update);
    return *_contextsFile;
}
// Flush and release the contexts file and the EDGE btree, then close the base.
void XmlIndex::close()
{
    if (_contextsFile)
    {
        _contextsFile->close();
        delete _contextsFile;
        _contextsFile = 0;
    }
    _edges->close();
    if (_update)
        _edgesParams->updateSchema();
    Index::close();
}
// ICU-based word tokenizer: splits UTF-8 text into lower-cased word tokens.
class Tokenizer
{
private:
    UnicodeString s;            // current text, converted to UTF-16
    BreakIterator *bi;          // ICU word-break iterator (en_US rules)
    int32_t start;              // start boundary of the token being scanned
    UConverter *utf8;           // UTF-8 <-> UnicodeString converter
    std::vector utfbuffer;      // scratch buffer for extracting tokens as UTF-8
public:
    Tokenizer();
    ~Tokenizer();
    void setText(const xmlChar *text);
    std::string nextToken();
};
Tokenizer::Tokenizer() : start(BreakIterator::DONE), utfbuffer(64)
{
    UErrorCode status = U_ZERO_ERROR;
    bi = BreakIterator::createWordInstance("en_US", status);
    utf8 = ucnv_open("utf-8", &status);
}
Tokenizer::~Tokenizer()
{
// NOTE(review): on Solaris bi/utf8 are deliberately leaked — presumably a
// workaround for a shutdown-order crash in ICU there; confirm before removing.
#if !defined(SOLARIS)
    delete bi;
    ucnv_close(utf8);
#endif
}
// Convert the UTF-8 input to UTF-16 and position the break iterator at the start.
void Tokenizer::setText(const xmlChar *text)
{
    UErrorCode status = U_ZERO_ERROR;
    s = UnicodeString((const char*)text, -1, utf8, status);
    bi->setText(s);
    start = ubrk_first(bi);
}
// Return the next word token (lower-cased, UTF-8), or "" when exhausted.
std::string Tokenizer::nextToken()
{
    std::string ret;
    int32_t end = ubrk_next(bi);
    // Skip non-word boundaries (punctuation/whitespace runs).
    while (end != BreakIterator::DONE)
    {
        if (ubrk_getRuleStatus(bi) != UBRK_WORD_NONE)
            break;
        start = end;
        end = ubrk_next(bi);
    }
    if (end != -1 && end != start)
    {
        UnicodeString token(s, start, end-start);
        token = token.toLower();
        size_t needed = 0;
        UErrorCode status = U_ZERO_ERROR;
        // NOTE(review): status is not reset inside this grow-and-retry loop; after
        // one U_BUFFER_OVERFLOW_ERROR, ICU's extract() bails out immediately on the
        // retry, yielding an empty token for words longer than the buffer — should
        // reset status to U_ZERO_ERROR each iteration; TODO confirm and fix.
        while ((needed =
token.extract(&utfbuffer[0], utfbuffer.size(), utf8, status)) > utfbuffer.size()) utfbuffer.resize(utfbuffer.size() * 2); ret = std::string(&utfbuffer[0], needed); start = end; } return ret; } typedef std::vector Vector; ConceptLocation::ConceptLocation(int conceptID, int begin, int end) : _concept(conceptID), _begin(begin), _end(end) { } #ifdef EMULATEORIGINALSORT class ConceptLocationSorter { public: virtual bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) = 0; private: // part of quicksearch int partition(std::vector &array, int p, int r) { ConceptLocation x = array[(p + r)/2]; int i = p - 1, j = r + 1; while (true) { while (smallerThan(x, array[--j])) ; while (smallerThan(array[++i], x)) ; if (i < j) { ConceptLocation t = array[i]; array[i] = array[j]; array[j] = t; } else return j; } } public: void quicksort(std::vector &array, int p, int r) { while (p < r) { int q = partition(array, p, r); quicksort(array, p, q); p = q + 1; } } }; class ConceptSorter : public ConceptLocationSorter { public: bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) { return a._concept < b._concept; } }; class PositionSorter : public ConceptLocationSorter { public: bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) { return a._begin < b._begin || a._begin == b._begin && a._end < b._end; } }; #else class ConceptSorter { public: bool operator()(const ConceptLocation &a, const ConceptLocation &b) const { return a._concept < b._concept; } }; class PositionSorter { public: bool operator()(const ConceptLocation &a, const ConceptLocation &b) const { return a._begin < b._begin || (a._begin == b._begin && a._end < b._end); } }; #endif void ConceptLocation::sortByPosition(std::vector &array, int i1, int i2) { #ifdef EMULATEORIGINALSORT PositionSorter _pComp; _pComp.quicksort(array, i1, i2 - 1); #else std::vector::iterator begin = array.begin(); std::vector::iterator end = begin; std::advance(begin, i1); std::advance(end, i2); 
std::sort(begin, end, PositionSorter()); #endif } void ConceptLocation::sortByConcept(std::vector &array, int i1, int i2) { #ifdef EMULATEORIGINALSORT ConceptSorter _cComp; _cComp.quicksort(array, i1, i2 - 1); #else std::vector::iterator begin = array.begin(); std::vector::iterator end = begin; std::advance(begin, i1); std::advance(end, i2); std::sort(begin, end, ConceptSorter()); #endif } typedef std::map NodeHashtable; typedef std::hash_map LinkHashTable; class IndexAdapter { private: static int StackSize; const char* _indexText_Name; const char* _indexElement_Name; const char* _indexAttribute_Name; const char* _nodeID_Name; const char* _tokenizer_Name; const char* _attributeName_Name; std::vector _indexOnOffStack; int _sp; int _tsp; std::vector< std::string > _attributeStack; xmlNodePtr _currentNode; int _attrSP; void storeLocation(const std::string &token, int number); void storeLocation(const std::string &token) { storeLocation(token, _lastWordNumber++); } void storeEdge(int relation, int seqNumber, int destination); void startElement(xmlNodePtr node); void attribute(const char *name, const char *value); void characters(const xmlChar *str) throw( HelpProcessingException ); void endElement(xmlNodePtr node); void indexText(const xmlChar *str); Vector _textNodes; NodeHashtable _numberedNodes; public: HashSet _stoplist; LinkHashTable _linkCodes; std::vector _linknames; static int CurrenMaxLinkCode; std::vector _locations; int _availContextNumber; IntegerArray _initialWords; IntegerArray _links; IntegerArray _dests; IntegerArray _seqNumbers; int _lastWordNumber; int _firstWord; bool _anyLocationsStored; XmlIndex *_index; private: static int InitSize; int _size; public: IndexAdapter(); void process(xmlNodePtr node, xmlDocPtr doc); void init(); void finish(); int intern(const std::string &name) { return _index->intern(name); } int getLinkCode(const std::string &linkName); }; int IndexAdapter::StackSize = 64; int IndexAdapter::InitSize = 4096; int 
IndexAdapter::CurrenMaxLinkCode = 0; IndexAdapter::IndexAdapter() : _indexOnOffStack(StackSize), _attributeStack(StackSize), _anyLocationsStored(false), _size(InitSize) { _indexText_Name = "text"; _indexElement_Name = "element"; _indexAttribute_Name = "attribute"; _nodeID_Name = "nodeID"; _tokenizer_Name = "tokenizer"; _attributeName_Name = "attributeName"; } void IndexAdapter::storeLocation(const std::string &token, int number) { int concept = intern(token); HCDBG(std::cerr << "storeLocation of number " << number << "for token " << token << " as conceptlocation " << concept << std::endl); _locations.push_back(ConceptLocation(concept, number, number)); } void IndexAdapter::storeEdge(int relation, int seqNumber, int destination) { _links.push_back(relation); _seqNumbers.push_back(seqNumber); _dests.push_back(destination); HCDBG(std::cerr << "storeEdge" << std::endl); } void IndexAdapter::finish() { _numberedNodes.clear(); _dests.clear(); _seqNumbers.clear(); _links.clear(); int nTextNodes = _textNodes.size(); _availContextNumber = nTextNodes; // vector to hold parents of text nodes Vector parents; /***** for each of the text nodes its sequence number is stored as well as the index of its parent (in _dests) _link is not stored as it is always "text()" _availContextNumber only used to number parent element contexts ******/ for (int i = 0; i < nTextNodes; i++) { xmlNodePtr node = _textNodes[i]; xmlNodePtr parent = node->parent; // find this text node's seq number int counter = 1; xmlNodePtr sibling = parent->xmlChildrenNode; while (sibling && sibling != node) { if (xmlNodeIsText(sibling)) ++counter; sibling = sibling->next; } _seqNumbers.push_back(counter); // check whether parent already encountered NodeHashtable::const_iterator number = _numberedNodes.find(parent); if (number == _numberedNodes.end()) // not yet seen { int newContext = _availContextNumber++; _numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext; 
_dests.push_back(newContext); // enqueue parent: its parent will need a number too parents.push_back(parent); // System.out.println(parent.getName().toString() + // " -> " + newContext); } else { _dests.push_back(number->second); } } // end for _textNodes.clear(); // store info about element ancestry of the above text nodes // grandparents are added to the end of the vector int rootElementPos = 0; for (size_t i = 0; i < parents.size(); i++) { xmlNodePtr node = parents[i]; std::string name((const char*)(node->name)); xmlNodePtr parent = node->parent; _links.push_back(getLinkCode(name)); // if (parent.getType() == Node.ELEMENT) // not ROOT if (parent && parent->parent) // not ROOT { // find sequence number xmlNodePtr sibling = parent->xmlChildrenNode; int counter = 1; while (sibling && sibling != node) { if (strcmp((const char*)sibling->name, (const char*)name.c_str()) == 0) ++counter; sibling = sibling->next; } _seqNumbers.push_back(counter); // check whether parent already known NodeHashtable::iterator number = _numberedNodes.find(parent); if (number == _numberedNodes.end()) { int newContext = _availContextNumber++; _numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext; _dests.push_back(newContext); // enqueue parent: its parent will need a number too parents.push_back(parent); //System.out.println(parent.getName().toString() + // " -> " + newContext); } else { _dests.push_back(number->second); } } else { _dests.push_back(0); // placeholder _seqNumbers.push_back(1); rootElementPos = i + nTextNodes; // System.out.println("rootElementPos = " + i); } } // end for if (_dests.empty()) _dests.push_back(0); // index to sentinel _dests[rootElementPos] = _availContextNumber; } // end public void finish void IndexAdapter::init() { _sp = -1; _tsp = -1; _attrSP = -1; _lastWordNumber = 0; _anyLocationsStored = false; _availContextNumber = 0; // all the contexts' tables _initialWords.clear(); _locations.clear(); } void 
IndexAdapter::attribute(const char *name, const char *value) { HCDBG(std::cerr << "attribute: " << name << " = " << value << std::endl); if (strcmp(name, _nodeID_Name) == 0) _currentNode = (xmlNodePtr)(strtol(value, NULL, 10)); else if (strcmp(name, _tokenizer_Name) == 0) { if (strcmp(value, "com.sun.xmlsearch.util.SimpleTokenizer") != 0 && !isExtensionMode() ) std::cerr << "changing tokenizers not implemented in C++ version of HelpLinker" << " because no other tokenizers were referenced in the helpcontent2 source" << std::endl; } else if (strcmp(name, _attributeName_Name) == 0) { //namespace prefix ? std::string attrVal = std::string("index:") + value; if( !isExtensionMode() ) std::cout << "attrVal = " << attrVal << std::endl; _attributeStack[_attrSP] = std::string(name) + '<' + value + '<' + attrVal; storeLocation("+<" + _attributeStack[_attrSP]); } } void IndexAdapter::indexText(const xmlChar *text) { static Tokenizer tokenizer; tokenizer.setText(text); _firstWord = _lastWordNumber; _anyLocationsStored = false; std::string lowercaseToken = tokenizer.nextToken(); while (!lowercaseToken.empty()) { HCDBG(std::cerr << "token is: " << lowercaseToken << std::endl); #ifdef EMULATEORIGINAL if ((lowercaseToken.size() == 1) && isdigit(lowercaseToken[0])) { lowercaseToken = tokenizer.nextToken(); continue; } #endif if (std::find(_stoplist.begin(), _stoplist.end(), lowercaseToken) == _stoplist.end()) { storeLocation(lowercaseToken); _anyLocationsStored = true; } else _lastWordNumber++; lowercaseToken = tokenizer.nextToken(); } if (_anyLocationsStored && _firstWord > -1) { _initialWords.push_back(_firstWord); HCDBG(std::cerr << "appending " << _firstWord << std::endl); _textNodes.push_back(_currentNode); } // reset before next batch _firstWord = -1; } void IndexAdapter::characters(const xmlChar *str) throw( HelpProcessingException ) { if (!str) { std::stringstream aStrStream; aStrStream << "no characters!" 
<< std::endl; throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); } HCDBG(std::cerr << "IndexAdapter::characters of " << str << std::endl); HCDBG(std::cerr << _sp << " : " << _indexOnOffStack[_sp] << std::endl); if (_sp >= 0 && _indexOnOffStack[_sp]) { indexText( str ); } } void IndexAdapter::startElement(xmlNodePtr node) { const char *name = (const char*)(node->name); HCDBG(std::cerr << "startElement is " << name << std::endl); if (strcmp(name, _indexElement_Name) == 0) { _indexOnOffStack[++_sp] = true; // pop Tokenizer stack // following attribute can push selected Tokenizer if (_tsp != -1) _tsp--; } else if (strcmp(name, _indexText_Name) == 0) { } else if (strcmp(name, _indexAttribute_Name) == 0) { _attrSP++; } } void IndexAdapter::endElement(xmlNodePtr node) { const char *name = (const char*)(node->name); HCDBG(std::cerr << "endElement is " << name << std::endl); if (strcmp(name, _indexElement_Name) == 0) _sp--; else if (strcmp(name, _indexText_Name) == 0) { // reset } else if (strcmp(name, _indexAttribute_Name) == 0) storeLocation("-<" + _attributeStack[_attrSP--]); } int IndexAdapter::getLinkCode(const std::string &linkName) { LinkHashTable::iterator code = _linkCodes.find(linkName); if (code != _linkCodes.end()) return code->second; else { _linknames.push_back(linkName); int newCode = CurrenMaxLinkCode++; _linkCodes.insert(LinkHashTable::value_type(linkName, newCode)).first->second = newCode; return newCode; } } void IndexAdapter::process(xmlNodePtr node, xmlDocPtr doc) { startElement(node); for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) { xmlChar *value = xmlNodeListGetString(doc, attr->children, 0); attribute((const char*)(attr->name), (const char*)value); xmlFree(value); } if (xmlNodeIsText(node)) { xmlChar *str = xmlNodeListGetString(doc, node, 1); characters(str); xmlFree(str); } for (xmlNodePtr test = node->xmlChildrenNode; test; test = test->next) process(test, doc); endElement(node); } class 
XmlIndexBuilder { private: fs::path _transformLocation; xsltStylesheetPtr _indexingTransform; IndexAdapter _indexAdapter; int _currentDocID; void reset(); xsltStylesheetPtr getTransform(const std::string &stylesheetName); public: XmlIndexBuilder() : _indexingTransform(0) {} XmlIndexBuilder(const fs::path &dir); ~XmlIndexBuilder(); void clearIndex(); void setTransformLocation(const fs::path &filelocation); void init(const std::string &transform); void initXmlProcessor(const std::string &transform); void indexDocument(xmlDocPtr document, const std::string &docURL, const std::string &title); int intern(const std::string &name); void openDocument(const std::string &name) throw( HelpProcessingException ); void closeDocument(const std::string &name) throw( HelpProcessingException ); void close(); }; void XmlIndexBuilder::close() { fs::path fullname = _indexAdapter._index->indexFile("LINKNAMES"); std::fstream _linkFile(fullname.native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary); #ifdef EMULATEORIGINAL static const unsigned char vectorheader[] = { 0xAC, 0xED, 0x00, 0x05, 0x75, 0x72, 0x00, 0x13, 0x5B, 0x4C, 0x6A, 0x61, 0x76, 0x61, 0x2E, 0x6C, 0x61, 0x6E, 0x67, 0x2E, 0x53, 0x74, 0x72, 0x69, 0x6E, 0x67, 0x3B, 0xAD, 0xD2, 0x56, 0xE7, 0xE9, 0x1D, 0x7B, 0x47, 0x02, 0x00, 0x00, 0x78, 0x70 }; _linkFile.write((const char*)(&vectorheader[0]), sizeof(vectorheader)); writeInt(_linkFile, _indexAdapter._linknames.size()); std::vector::iterator aEnd = _indexAdapter._linknames.end(); for (std::vector::iterator aIter = _indexAdapter._linknames.begin(); aIter != aEnd; ++aIter) { HCDBG(std::cerr << "linkname is " << *aIter << std::endl); _linkFile << 't'; writeShort(_linkFile, aIter->size()); _linkFile << *aIter; } #else std::vector::iterator aEnd = _indexAdapter._linknames.end(); for (std::vector::iterator aIter = _indexAdapter._linknames.begin(); aIter != aEnd; ++aIter) { _linkFile << *aIter << '\n'; } #endif #if 0 // output link codes /* Enumeration keys = 
_linknames.elements(); while (keys.hasMoreElements()) System.out.println((String)keys.nextElement()); */ #endif _indexAdapter._index->close(); std::cout << "done" << std::endl; } int XmlIndexBuilder::intern(const std::string &name) { return _indexAdapter.intern(name); } void XmlIndexBuilder::openDocument(const std::string &name) throw( HelpProcessingException ) { if (_currentDocID != 0) { std::stringstream aStrStream; aStrStream << "document already open" << std::endl; throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); } _currentDocID = intern( PrefixTranslator::translatePrefix(name) ); reset(); // reset context gathering state } int BitBuffer::InitSize = 256; int BitBuffer::NBits = 32; int BitBuffer::BitsInByte = 8; int BitBuffer::BytesInInt = 4; void Compressor::encode(const IntegerArray &pos, int k) { HCDBG(std::cerr << "1:start this encode of " << k << "size of " << pos.size() << std::endl); unsigned int n1 = 0; unsigned int power = 1 << k; for (size_t i = 0; i < pos.size(); i++) { HCDBG(std::cerr << "1: loop " << i << std::endl); unsigned int n2 = pos[i] >> k; int rem = pos[i] % power; HCDBG(std::cerr << "1: n1, n2 : " << n1 << "," << n2 << std::endl); if (n2 != n1) { unsigned int min = n1; unsigned int a = n1; int lev = 0, power2 = 1; if (n2 > n1) for (size_t max = n1; max < n2; a >>= 1, power2 <<= 1, lev++) if ((a & 1) != 0) min -= power2; else max += power2; else for ( ; min > n2; a >>= 1, power2 <<= 1, lev++) if ((a & 1) != 0) min -= power2; // lev 0s, 1, lev bits of (n2 - min) plus following value // no 'V' symbol needed here if (lev*2 + 1 + k <= NBits) _buffer.append((1<> k; int rem = pos[i] % power; HCDBG(std::cerr << "2: n1, n2 : " << n1 << "," << n2 << std::endl); if (n2 != n1) { int min = n1, a = n1; int lev = 0, power2 = 1; if (n2 > n1) for (int max = n1; max < n2; a >>= 1, power2 <<= 1, lev++) if ((a & 1) != 0) min -= power2; else max += power2; else for ( ; min > n2; a >>= 1, power2 <<= 1, lev++) if ((a & 1) != 0) 
min -= power2; // lev 0s, 1, lev bits of (n2 - min) plus following value if (lev*2 + 1 + k <= NBits) _buffer.append((1< 0; k--) { _buffer.clear(); encode(array, k); if (_buffer.bitCount() < min) { saved.setFrom(_buffer); min = _buffer.bitCount(); minK = k; } else break; } } _buffer.setFrom(saved); return minK; } int Compressor::compressAscending(const IntegerArray &array) { IntegerArray differences(array.size()); toDifferences(array, differences); return minimize(differences, BeginK); } int Compressor::NBits = 32; int Compressor::BeginK = 5; class DocumentCompressor { public: static int NConceptsInGroup; static int BitsInLabel; static int DefaultSize; private: int _nGroups; int _nExtents; unsigned int _freeComp; int _kk; Compressor *_currentCompressor; std::vector _compressors; Compressor _kCompr; Compressor _lCompr; Compressor _mCompr; Compressor _posCompressor; IntegerArray _kTable; // k's for the series IntegerArray _lTable; // lengths of the C/P groups IntegerArray _maxConcepts; // maximal concepts in CP IntegerArray _concepts; IntegerArray _documents; IntegerArray _microIndexOffsets; IntegerArray _titles; // _contextsOffsets for use in XML indexing IntegerArray _contextsOffsets; IntegerArray _positions; IntegerArray _labels; public: DocumentCompressor() : _currentCompressor(0), _compressors(DefaultSize) {} void writeOutMicroIndex(std::fstream &output, std::vector &locations, std::vector &extents) { HCDBG(std::cerr << "writeOutMicroIndex start" << std::endl); encode(locations, NConceptsInGroup); HCDBG(std::cerr << "writeOutMicroIndex end encode" << std::endl); if (!extents.empty()) encodeExtents(extents); HCDBG(std::cerr << "writeOutMicroIndex finalize" << std::endl); finalizeEncoding(); HCDBG(std::cerr << "writeOutMicroIndex write" << std::endl); writeOut(output); HCDBG(std::cerr << "writeOutMicroIndex end" << std::endl); } private: void encode(std::vector &locations, int nConcepts) { int initK = 4; // first sort by concept only #ifdef CMCDEBUG for (size_t i = 
0; i < locations.size(); ++i) fprintf(stderr, "unsorted is %d\n", locations[i].getConcept()); #endif HCDBG(std::cerr << "start sort" << std::endl); ConceptLocation::sortByConcept(locations, 0, locations.size()); HCDBG(std::cerr << "end sort" << std::endl); #ifdef CMCDEBUG for (size_t i = 0; i < locations.size(); ++i) fprintf(stderr, "sorted is %d\n", locations[i].getConcept()); #endif // using the fact that concepts are already sorted // count of groups of 'nConcepts' // go for differences directly // clear the state _nGroups = 0; _nExtents = 0; _kTable.clear(); _lTable.clear(); _concepts.clear(); _maxConcepts.clear(); _kCompr.clear(); _lCompr.clear(); _mCompr.clear(); for (size_t i = 0; i < _compressors.size(); i++) _compressors[i].clear(); _freeComp = 0; _currentCompressor = NULL; // end of resetting state int conceptCounter = 0; int fromIndex = 0; int prevMax = 0; int last = locations[0].getConcept(); // init w/ first ID nextCompressor(); _concepts.push_back(last); for (size_t i = 0;;) { for (; i < locations.size() && locations[i].getConcept() == last; i++) locations[i].setConcept(conceptCounter); if (i == locations.size()) { if (!_concepts.empty()) { ++_nGroups; _kTable.push_back(_currentCompressor->minimize(_concepts, initK)); } encodePositions(locations, fromIndex, i, BitsInLabel); break; } else { // new concept (group?) 
if (++conceptCounter == nConcepts) { ++_nGroups; // we are looking at the beginning of a new group // last is maximal for the group just finished // it won't be stored in concepts array but maxConcepts _concepts.pop_back(); HCDBG(fprintf(stderr, "_maxConcepts %d %d -> %d\n", last, prevMax, last - prevMax)); _maxConcepts.push_back(last - prevMax); prevMax = last; _kTable.push_back(_currentCompressor->minimize(_concepts, initK)); #ifdef CMCDEBUG for(size_t p = 0; p < locations.size(); ++p) std::cerr << "microindex2 this testing is " << locations[p].getBegin() << locations[p].getEnd() << " : " << locations[p].getConcept() << std::endl; #endif HCDBG(std::cerr << "two encodePositions " << fromIndex << " " << i << std::endl); encodePositions(locations, fromIndex, i, BitsInLabel); fromIndex = i; nextCompressor(); _concepts.clear(); conceptCounter = 0; } _concepts.push_back(locations[i].getConcept() - last); last = locations[i].getConcept(); } } } void encodePositions(std::vector &locations, int from, int to, int cK) { int initK = 3; int lastPos, k; // sort in place by psitions only #ifdef CMCDEBUG for (int i = from; i < to; ++i) fprintf(stderr, "unsorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd()); #endif ConceptLocation::sortByPosition(locations, from, to); #ifdef CMCDEBUG for (int i = from; i < to; ++i) fprintf(stderr, "sorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd()); #endif _positions.clear(); _labels.clear(); _positions.push_back(lastPos = locations[from].getBegin()); _labels.push_back(locations[from].getConcept()); // now: a label // skip duplicates for (int i = from, j = from + 1; j < to; j++) { if (locations[i].equals(locations[j]) == false) { i = j; HCDBG(std::cerr << "i is " << i << "locations begin is " << locations[i].getBegin() << "last pos is " << lastPos << std::endl); _positions.push_back(locations[i].getBegin() - lastPos); lastPos = locations[i].getBegin(); _labels.push_back(locations[i].getConcept()); // now: a label } 
} // first find k by minimizing just positions w/o labels _kTable.push_back(k = _posCompressor.minimize(_positions, initK)); _posCompressor.clear(); HCDBG(std::cerr << "start encodePositions" << std::endl); _posCompressor.encode(_positions, _labels, k, cK); HCDBG(std::cerr << "end encodePositions" << std::endl); _currentCompressor->concatenate(_posCompressor); } void encodeExtents(std::vector &extents) { // side effects: // 'k3' added to _kTable // a number of compressors populated: header + lengths' lists int initK = 4; int c = 0; IntegerArray concepts; //difference IntegerArray lengths; IntegerArray kTable; IntegerArray lTable; // reserve a compressor for concatenated tables nextCompressor(); Compressor *extentsHeader = _currentCompressor; std::vector::const_iterator aEnd = extents.end(); for (std::vector::const_iterator aIter = extents.begin(); aIter != aEnd; ++aIter) { if (aIter->getConcept() != c) { if (c != 0) { _nExtents++; nextCompressor(); kTable.push_back(_currentCompressor->minimize(lengths, initK)); lTable.push_back(_currentCompressor->byteCount()); } concepts.push_back(aIter->getConcept() - c); c = aIter->getConcept(); lengths.clear(); lengths.push_back(aIter->getLength()); } else lengths.push_back(aIter->getLength()); } // last table of lengths nextCompressor(); kTable.push_back(_currentCompressor->minimize(lengths, initK)); lTable.push_back(_currentCompressor->byteCount()); Compressor compressor1; kTable.push_back(compressor1.minimize(lTable, initK)); Compressor compressor2; kTable.push_back(compressor2.minimize(concepts, initK)); _kTable.push_back(extentsHeader->minimize(kTable, initK)); // k3 extentsHeader->concatenate(compressor1); extentsHeader->concatenate(compressor2); } void finalizeEncoding() { if (_nGroups > 1) { // if extents follow C/P groups we need the length of the last group int limit = _nExtents > 0 ? 
_freeComp : _freeComp - 1; for (int j = 0; j < limit; j++) // length of last not saved _lTable.push_back(_compressors[j].byteCount()); _kTable.push_back(_mCompr.minimize(_maxConcepts, 3)); _kTable.push_back(_lCompr.minimize(_lTable, 3)); _kk = _kCompr.minimize(_kTable, 3); _kCompr.concatenate(_lCompr); _kCompr.concatenate(_mCompr); } else if (_nGroups == 1 && _nExtents > 0) { // length of the single C/P group packed with k-s _kTable.push_back(_compressors[0].byteCount()); _kk = _kCompr.minimize(_kTable, 3); } } void writeOut(std::fstream &out) { if (_nExtents == 0) { if (_nGroups > 1) { unsigned char byte = static_cast((0x80 | _kk)); out.write( (const char*)&byte, 1 ); HCDBG(std::cerr << "writeOut of " << int(byte) << std::endl); _kCompr.write(out); // concatenated k,l,m for (size_t j = 0; j < _freeComp; j++) _compressors[j].write(out); } else // single group, no extents; code: 00 { unsigned char k1 = (unsigned char)(_kTable[0]); unsigned char k2 = (unsigned char)(_kTable[1]); out.write( (const char*)&k1, 1 ); out.write( (const char*)&k2, 1 ); _compressors[0].write(out); // C/P } } else { // extents unsigned char byte = static_cast( (_nGroups > 1 ? 
0xC0 : 0x40) | _kk); out.write( (const char*)&byte, 1 ); _kCompr.write(out); for (size_t j = 0; j < _freeComp; j++) _compressors[j].write(out); } } Compressor* nextCompressor() { if (_freeComp == _compressors.size()) _compressors.push_back(Compressor()); return _currentCompressor = &_compressors[_freeComp++]; } int byteCount() { if (_nGroups == 1 && _nExtents == 0) return 2 + _compressors[0].byteCount(); else { int result = 1; // initial kk result += _kCompr.byteCount(); for (size_t j = 0; j < _freeComp; j++) result += _compressors[j].byteCount(); return result; } } }; int DocumentCompressor::NConceptsInGroup = 16; int DocumentCompressor::BitsInLabel = 4; int DocumentCompressor::DefaultSize = 32; DocumentCompressor& Index::getDocumentCompressor() { if (!_documentCompressor) _documentCompressor = new DocumentCompressor(); return *_documentCompressor; } void Index::compress(int docID, int titleID, std::vector &locations, std::vector &extents) { std::fstream &positions = getPositionsFile(); positions.seekg(0, std::ios::end); long currentEnd = positions.tellg(); if (currentEnd < 0) currentEnd = 0; positions.clear(); positions.seekg(currentEnd, std::ios::beg); _documents.push_back(docID); _microIndexOffsets.push_back(currentEnd); HCDBG(std::cerr << "_microIndexOffsets pushed back " << currentEnd << std::endl); HCDBG(std::cerr << "added title id of " << titleID << std::endl); _titles.push_back(titleID); getDocumentCompressor().writeOutMicroIndex(positions, locations, extents); } void Index::writeOutOffsets() { Compressor documents; int k1 = documents.minimize(_documents, 8); Compressor offsets; int k2 = offsets.compressAscending(_microIndexOffsets); Compressor titles; int k3 = titles.minimize(_titles, 8); // 8 is the starting k std::fstream &out = getOffsetsFile(); out.seekp(0); // position at beginning out.clear(); unsigned char byte; byte = static_cast(k1); out.write( (const char*)&byte, 1 ); HCDBG(fprintf(stderr, "a: offset dump of %x\n", byte)); documents.write(out); 
byte = static_cast(k2); out.write( (const char*)&byte, 1 ); HCDBG(fprintf(stderr, "b: offset dump of %x\n", byte)); offsets.write(out); byte = static_cast(k3); out.write( (const char*)&byte, 1 ); HCDBG(fprintf(stderr, "c: offset dump of %x\n", byte)); titles.write(out); } Index::~Index() { delete _schema; delete _dictParams; delete _dict; delete _positionsFile; delete _offsetsFile; delete _documentCompressor; } void XmlIndex::compress(int docID, int titleID, std::vector &locations, std::vector &extents, int k, const Compressor &contextTables) { HCDBG(std::cerr << "start compress" << std::endl); HCDBG(std::cerr << "docID : " << docID << " titleID : " << titleID << "locations size : " << locations.size() << "extents size : " << extents.size() << std::endl); Index::compress(docID, titleID, locations, extents); HCDBG(std::cerr << "end compress" << std::endl); std::fstream& contexts = getContextsFile(); contexts.seekp(0, std::ios::end); long currentEnd = contexts.tellp(); if (currentEnd < 0) currentEnd = 0; contexts.clear(); contexts.seekp(currentEnd); writeByte(contexts, static_cast(k)); contextTables.write(contexts); _contextsOffsets.push_back(currentEnd); } void XmlIndexBuilder::closeDocument(const std::string &title) throw( HelpProcessingException ) { if (_currentDocID == 0) { std::stringstream aStrStream; aStrStream << "no document open" << std::endl; throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); } else if (!_indexAdapter._locations.empty()) { IntegerArray kTable; Compressor compressor1; Compressor compressor2; Compressor compressor3; Compressor compressor4; kTable.push_back(compressor1.compressAscending(_indexAdapter._initialWords)); kTable.push_back(compressor2.minimize(_indexAdapter._dests, 2)); kTable.push_back(compressor3.minimize(_indexAdapter._links, 2)); kTable.push_back(compressor4.minimize(_indexAdapter._seqNumbers, 2)); Compressor compressor0; int k0 = compressor0.minimize(kTable, 4); 
compressor0.concatenate(compressor1); compressor0.concatenate(compressor2); compressor0.concatenate(compressor3); compressor0.concatenate(compressor4); std::vector dummy; _indexAdapter._index->compress(_currentDocID, intern(title), _indexAdapter._locations, dummy, k0, compressor0); } else { // System.out.println("no indexable content"); } _indexAdapter._locations.clear(); _currentDocID = 0; // state: nothing open } void XmlIndexBuilder::indexDocument(xmlDocPtr doc, const std::string &docURL, const std::string &title) { HCDBG(std::cerr << "Indexing " << docURL << std::endl); xmlNodePtr root = xmlDocGetRootElement(doc); openDocument(docURL); // xmlDocDump(stdout, doc); xmlDocPtr res = xsltApplyStylesheet(_indexingTransform, doc, NULL); _indexAdapter.init(); // start = System.currentTimeMillis(); root = xmlDocGetRootElement(res); if (root) { // xmlDocDump(stdout, res); for (xmlNodePtr test = root; test; test = test->next) _indexAdapter.process(test, res); } xmlFreeDoc(res); // System.out.println((System.currentTimeMillis()-start)+" transform"); // start = System.currentTimeMillis(); _indexAdapter.finish(); // System.out.println((System.currentTimeMillis()-start)+" finish"); // start = System.currentTimeMillis(); closeDocument(title); // System.out.println((System.currentTimeMillis()-start)+" close"); } XmlIndexBuilder::~XmlIndexBuilder() { delete _indexAdapter._index; } void XmlIndexBuilder::setTransformLocation(const fs::path &filelocation) { _transformLocation = filelocation; } xsltStylesheetPtr XmlIndexBuilder::getTransform(const std::string &stylesheetName) { fs::path stylesheet = _transformLocation / (stylesheetName + ".xsl"); return xsltParseStylesheetFile((const xmlChar *)stylesheet.native_file_string().c_str()); } void XmlIndexBuilder::initXmlProcessor(const std::string &transform) { _indexingTransform = getTransform(transform); } void XmlIndexBuilder::init(const std::string &transform) { _indexAdapter._index->init(); #ifdef EMULATEORIGINAL //some kind of bug 
in the original AFAICS _indexAdapter._stoplist.push_back("andnull"); #endif reset(); // initialize vector and hashtable const std::vector &linkNames = _indexAdapter._index->getLinkNames(); std::vector::const_iterator aEnd = linkNames.end(); for (std::vector::const_iterator aIter = linkNames.begin(); aIter != aEnd; ++aIter) { _indexAdapter.getLinkCode(*aIter); } initXmlProcessor(transform); } void XmlIndexBuilder::reset() { _indexAdapter._availContextNumber = 0; _indexAdapter._lastWordNumber = 0; _indexAdapter._locations.clear(); _indexAdapter._anyLocationsStored = false; // all the contexts' tables _indexAdapter._initialWords.clear(); _indexAdapter._dests.clear(); _indexAdapter._links.clear(); _indexAdapter._seqNumbers.clear(); } XmlIndexBuilder::XmlIndexBuilder(const fs::path &indexDir) : _indexingTransform(0), _currentDocID(0) { HCDBG(std::cerr << "indexDir is " << indexDir.native_directory_string() << std::endl); _indexAdapter._index = new XmlIndex(indexDir, true); } void XmlIndexBuilder::clearIndex() { _indexAdapter._index->clear(); } class HelpLinker { public: static void main(std::vector &args, std::string* pExtensionPath = NULL ) throw( HelpProcessingException ); static bool isExtensionMode( void ) {return bExtensionMode; } private: HelpLinker() : init(true), xmlIndexBuilder(NULL) {} ~HelpLinker() { delete xmlIndexBuilder; } JarOutputStream jarOutputStream; static int locCount, totCount; static Stringtable additionalFiles; static HashSet helpFiles; static fs::path sourceRoot; static fs::path embeddStylesheet; static fs::path indexStylesheet; static fs::path outputFile; static std::string module; static std::string lang; static std::string hid; static std::string extensionPath; static bool bExtensionMode; fs::path indexDirName; Stringtable hidlistTranslation; fs::path indexDirParentName; bool init; XmlIndexBuilder* xmlIndexBuilder; void initXMLIndexBuilder(); void createFileFromBytes(const std::string &fileName, const std::string &defaultXSL); void 
closeXMLIndexBuilder() { xmlIndexBuilder->close(); } void link() throw( HelpProcessingException ); void addBookmark( DB* dbBase, std::string thishid, const std::string& fileB, const std::string& anchorB, const std::string& jarfileB, const std::string& titleB ); #if 0 /** * @param outputFile * @param module * @param lang * @param hid * @param helpFiles * @param additionalFiles */ private HelpURLStreamHandlerFactory urlHandler = null; #endif }; bool isExtensionMode( void ) { return HelpLinker::isExtensionMode(); } namespace URLEncoder { static std::string encode(const std::string &rIn) { const char *good = "!$&'()*+,-.=@_"; static const char hex[17] = "0123456789ABCDEF"; std::string result; for (size_t i=0; i < rIn.length(); ++i) { unsigned char c = rIn[i]; if (isalnum (c) || strchr (good, c)) result += c; else { result += '%'; result += hex[c >> 4]; result += hex[c & 0xf]; } } return result; } } JarOutputStream::JarOutputStream() { perlline << "use Archive::Zip qw(:ERROR_CODES); "; perlline << "my $zip = Archive::Zip->new(); "; } std::string replaceAll(std::string result, const std::string &search, const std::string &replace) { std::string::size_type pos = 0; while(1) { pos = result.find(search, pos); if (pos == std::string::npos) break; result.replace(pos, search.size(), replace); pos += replace.size(); } return result; } void JarOutputStream::addFile(const std::string &fileName, const std::string &name) { perlline << "$zip->addFile(\"" << replaceAll(fileName, "\\", "/") << "\", \"" << name << "\"); "; } void JarOutputStream::addTree(const std::string &tree, const std::string &name) { perlline << "$zip->addTree(\"" << replaceAll(tree, "\\", "/") << "\", \"" << name << "\"); "; } void JarOutputStream::dontCompress(const std::string &key) { perlline << "my $member = $zip->memberNamed(\"" << key << "\"); "; perlline << "if ($member) { $member->desiredCompressionMethod( COMPRESSION_STORED ); } "; } void JarOutputStream::commit() { perlline << "print 
$zip->writeToFileNamed(\"" << replaceAll(getname().native_file_string(), "\\", "/") << "\").\"\\n\"; "; fs::path tmp = getname(); tmp.append(".perl"); std::string perlfile = replaceAll( tmp.native_file_string(), "\\", "/"); std::ofstream fos(perlfile.c_str()); fos << perlline.str(); fos.close(); std::string myperl("perl"); std::string is4nt; char* use_shell = getenv( "USE_SHELL" ); if ( use_shell ) is4nt = use_shell; if( !is4nt.empty() && is4nt == "4nt" ) { // in SO windows environment perl isn't in the path and // needs to be fetched from the environment. this doesn't // work in a cygwin shell as "/usr/bin/perl" will fail in a // native shell (see system call). myperl = getenv( "PERL" ); } std::string commandline; commandline = myperl + " " + perlfile; HCDBG(std::cerr << "command line 3 is" << commandline << std::endl); // on windows, calling perl (either cygwin or native) from a native // shell the only chance to survive is using "c:/foo" notation if ( system(commandline.c_str()) ) fprintf (stderr, "ERROR: calling generated perl script failed!\n"); fs::remove(tmp); } void HelpLinker::addBookmark( DB* dbBase, std::string thishid, const std::string& fileB, const std::string& anchorB, const std::string& jarfileB, const std::string& titleB) { HCDBG(std::cerr << "HelpLinker::addBookmark " << thishid << " " << fileB << " " << anchorB << " " << jarfileB << " " << titleB << std::endl); std::string temp = thishid; std::transform (temp.begin(), temp.end(), temp.begin(), toupper); std::replace(temp.begin(), temp.end(), ':', '_'); const std::string& translatedHid = hidlistTranslation[temp]; if (!translatedHid.empty()) thishid = translatedHid; thishid = URLEncoder::encode(thishid); DBT key; memset(&key, 0, sizeof(key)); key.data = const_cast(thishid.c_str()); key.size = thishid.length(); int fileLen = fileB.length(); if (!anchorB.empty()) fileLen += (1 + anchorB.length()); int dataLen = 1 + fileLen + 1 + jarfileB.length() + 1 + titleB.length(); std::vector dataB(dataLen); 
size_t i = 0; dataB[i++] = static_cast(fileLen); for (size_t j = 0; j < fileB.length(); ++j) dataB[i++] = fileB[j]; if (!anchorB.empty()) { dataB[i++] = '#'; for (size_t j = 0; j < anchorB.length(); ++j) dataB[i++] = anchorB[j]; } dataB[i++] = static_cast(jarfileB.length()); for (size_t j = 0; j < jarfileB.length(); ++j) dataB[i++] = jarfileB[j]; dataB[i++] = static_cast(titleB.length()); for (size_t j = 0; j < titleB.length(); ++j) dataB[i++] = titleB[j]; DBT data; memset(&data, 0, sizeof(data)); data.data = &dataB[0]; data.size = dataB.size(); dbBase->put(dbBase, NULL, &key, &data, 0); } void HelpLinker::createFileFromBytes(const std::string &fileName, const std::string &defaultXSL) { std::ofstream fos((indexDirParentName / fileName).native_file_string().c_str()); fos << defaultXSL; } void HelpLinker::initXMLIndexBuilder() { std::string mod = module; std::transform (mod.begin(), mod.end(), mod.begin(), tolower); indexDirName = indexDirParentName / (mod + ".idx"); fs::create_directory(indexDirName); if (xmlIndexBuilder) delete xmlIndexBuilder; xmlIndexBuilder = new XmlIndexBuilder(indexDirName); std::string defaultXSL = "\n" "\n" "\t\n" ""; createFileFromBytes("default.xsl", defaultXSL); xmlIndexBuilder->clearIndex(); // Build index from scratch xmlIndexBuilder->setTransformLocation(indexDirParentName); } namespace { fs::path gettmppath() { fs::path ret; osl::File::createTempFile(0, 0, &ret.data); fs::remove(ret); return ret; } } extern "C" void function_orig_pointer(xmlXPathParserContextPtr ctxt, int nargs) { if (nargs > 1) { // TODO: Change when used for extensions, no exception possible here std::cerr << "function_orig_pointer, too many args" << std::endl; exit(-1); } xmlNodePtr cur = NULL; if (nargs == 0) cur = ctxt->context->node; else if (nargs == 1) { xmlXPathObjectPtr obj = valuePop(ctxt); xmlNodeSetPtr nodelist = obj->nodesetval; if ((nodelist == NULL) || (nodelist->nodeNr <= 0)) { // TODO: Change when used for extensions, no exception possible here 
std::cerr << "function_orig_pointer, bad nodeset" << std::endl; exit(-1); } cur = nodelist->nodeTab[0]; for (int i = 1; i < nodelist->nodeNr; ++i) { int ret = xmlXPathCmpNodes(cur, nodelist->nodeTab[i]); if (ret == -1) cur = nodelist->nodeTab[i]; } xmlXPathFreeObject(obj); } if (cur == NULL) { // TODO: Change when used for extensions, no exception possible here std::cerr << "function_orig_pointer, bad node" << std::endl; exit(-1); } static xmlChar str[20]; sprintf((char *)str, "%ld", (sal_uIntPtr)(cur)); valuePush(ctxt, xmlXPathNewString(str)); } extern "C" void* cmc_module_init(xsltTransformContextPtr ctxt, const xmlChar* uri) { if (xsltRegisterExtFunction(ctxt, (const xmlChar*)"orig-pointer", uri, function_orig_pointer)) { // TODO: Change when used for extensions, no exception possible here std::cerr << "failure to register function_orig_pointer" << std::endl; exit(-1); } return NULL; } extern "C" void cmc_module_term(xsltTransformContextPtr, const xmlChar*, void*) { } /** * */ void HelpLinker::link() throw( HelpProcessingException ) { bool bIndexForExtension = false; // TODO if( bExtensionMode ) { indexDirParentName = sourceRoot; } else { indexDirParentName = gettmppath(); fs::create_directory(indexDirParentName); } #ifdef CMC_DEBUG std::cerr << "will not delete tmpdir of " << indexDirParentName.native_file_string().c_str() << std::endl; #endif std::string mod = module; std::transform (mod.begin(), mod.end(), mod.begin(), tolower); // Determine the outputstream fs::path outputTmpFile; if( !bExtensionMode ) { outputTmpFile = outputFile; outputTmpFile.append(".tmp"); jarOutputStream.setname(outputTmpFile); } // do the work here // continue with introduction of the overall process thing into the // here all hzip files will be worked on std::string appl = mod; if (appl[0] == 's') appl = appl.substr(1); fs::path helpTextFileName(indexDirParentName / (mod + ".ht")); DB* helpText(0); db_create(&helpText,0,0); helpText->open(helpText, NULL, 
helpTextFileName.native_file_string().c_str(), NULL, DB_BTREE, DB_CREATE | DB_TRUNCATE, 0644); fs::path dbBaseFileName(indexDirParentName / (mod + ".db")); DB* dbBase(0); db_create(&dbBase,0,0); dbBase->open(dbBase, NULL, dbBaseFileName.native_file_string().c_str(), NULL, DB_BTREE, DB_CREATE | DB_TRUNCATE, 0644); fs::path keyWordFileName(indexDirParentName / (mod + ".key")); DB* keyWord(0); db_create(&keyWord,0,0); keyWord->open(keyWord, NULL, keyWordFileName.native_file_string().c_str(), NULL, DB_BTREE, DB_CREATE | DB_TRUNCATE, 0644); HelpKeyword helpKeyword; // catch HelpProcessingException to avoid locking data bases try { std::ifstream fileReader(hid.c_str()); while (fileReader) { std::string key; fileReader >> key; std::transform (key.begin(), key.end(), key.begin(), toupper); std::replace(key.begin(), key.end(), ':', '_'); std::string data; fileReader >> data; if (!key.empty() && !data.empty()) hidlistTranslation[key] = data; } // lastly, initialize the indexBuilder if ( (!bExtensionMode || bIndexForExtension) && !helpFiles.empty()) initXMLIndexBuilder(); if( !bExtensionMode ) { std::cout << "Making " << outputFile.native_file_string() << " from " << helpFiles.size() << " input files" << std::endl; } // here we start our loop over the hzip files. 
HashSet::iterator end = helpFiles.end(); for (HashSet::iterator iter = helpFiles.begin(); iter != end; ++iter) { std::cout << "."; std::cout.flush(); // process one file // streamTable contains the streams in the hzip file StreamTable streamTable; const std::string &xhpFileName = *iter; if (!bExtensionMode && xhpFileName.rfind(".xhp") != xhpFileName.length()-4) { // only work on .xhp - files std::cerr << "ERROR: input list entry '" << xhpFileName << "' has the wrong extension (only files with extension .xhp " << "are accepted)"; continue; } fs::path langsourceRoot(sourceRoot); fs::path xhpFile; if( bExtensionMode ) { // langsourceRoot == sourceRoot for extensions std::string xhpFileNameComplete( extensionPath ); xhpFileNameComplete.append( '/' + xhpFileName ); xhpFile = fs::path( xhpFileNameComplete ); } else { langsourceRoot.append('/' + lang + '/'); xhpFile = fs::path(xhpFileName, fs::native); } HelpCompiler hc( streamTable, xhpFile, langsourceRoot, embeddStylesheet, module, lang, bExtensionMode ); HCDBG(std::cerr << "before compile of " << xhpFileName << std::endl); bool success = hc.compile(); HCDBG(std::cerr << "after compile of " << xhpFileName << std::endl); if (!success && !bExtensionMode) { std::stringstream aStrStream; aStrStream << "\nERROR: compiling help particle '" << xhpFileName << "' for language '" << lang << "' failed!"; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } const std::string documentBaseId = streamTable.document_id; std::string documentPath = streamTable.document_path; if (documentPath.find("/") == 0) documentPath = documentPath.substr(1); std::string documentJarfile = streamTable.document_module + ".jar"; std::string documentTitle = streamTable.document_title; if (documentTitle.empty()) documentTitle = ""; #if 0 std::cout << "for " << xhpFileName << " documentBaseId is " << documentBaseId << "\n"; std::cout << "for " << xhpFileName << " documentPath is " << documentPath << "\n"; std::cout << "for " << 
xhpFileName << " documentJarfile is " << documentJarfile << "\n"; std::cout << "for " << xhpFileName << " documentPath is " << documentTitle << "\n"; #endif const std::string& fileB = documentPath; const std::string& jarfileB = documentJarfile; std::string& titleB = documentTitle; // add once this as its own id. addBookmark(dbBase, documentPath, fileB, std::string(), jarfileB, titleB); if ( (!bExtensionMode || bIndexForExtension) && init) { std::ifstream indexXSLFile(indexStylesheet.native_file_string().c_str()); std::ostringstream baos; baos << indexXSLFile.rdbuf(); std::string xsl = baos.str(); //I see that we later generate a map of generateids to nodes which we will use //to link the results of generate-id in the transformed document back to the nodes //in the original document, so let's cut out the middle-men and make an extension //which does exactly what we want, and give us a pointer to the original node xsl.replace(xsl.find("init("index"); init = false; } // first the database *.db // ByteArrayInputStream bais = null; // ObjectInputStream ois = null; const HashSet *hidlist = streamTable.appl_hidlist; if (!hidlist) hidlist = streamTable.default_hidlist; if (hidlist && !hidlist->empty()) { // now iterate over all elements of the hidlist HashSet::const_iterator aEnd = hidlist->end(); for (HashSet::const_iterator hidListIter = hidlist->begin(); hidListIter != aEnd; ++hidListIter) { std::string thishid = *hidListIter; std::string anchorB; size_t index = thishid.rfind('#'); if (index != std::string::npos) { anchorB = thishid.substr(1 + index); thishid = thishid.substr(0, index); } addBookmark(dbBase, thishid, fileB, anchorB, jarfileB, titleB); } } // now the keywords const Hashtable *anchorToLL = streamTable.appl_keywords; if (!anchorToLL) anchorToLL = streamTable.default_keywords; if (anchorToLL && !anchorToLL->empty()) { std::string fakedHid = URLEncoder::encode(documentPath); Hashtable::const_iterator aEnd = anchorToLL->end(); for (Hashtable::const_iterator 
enumer = anchorToLL->begin(); enumer != aEnd; ++enumer) { const std::string &anchor = enumer->first; addBookmark(dbBase, documentPath, fileB, anchor, jarfileB, titleB); std::string totalId = fakedHid + "#" + anchor; // std::cerr << hzipFileName << std::endl; const LinkedList& ll = enumer->second; LinkedList::const_iterator aOtherEnd = ll.end(); for (LinkedList::const_iterator llIter = ll.begin(); llIter != aOtherEnd; ++llIter) { helpKeyword.insert(*llIter, totalId); } } } // and last the helptexts const Stringtable *helpTextHash = streamTable.appl_helptexts; if (!helpTextHash) helpTextHash = streamTable.default_helptexts; if (helpTextHash && !helpTextHash->empty()) { Stringtable::const_iterator aEnd = helpTextHash->end(); for (Stringtable::const_iterator helpTextIter = helpTextHash->begin(); helpTextIter != aEnd; ++helpTextIter) { std::string helpTextId = helpTextIter->first; const std::string& helpTextText = helpTextIter->second; std::string temp = helpTextId; std::transform (temp.begin(), temp.end(), temp.begin(), toupper); std::replace(temp.begin(), temp.end(), ':', '_'); const std::string& tHid = hidlistTranslation[temp]; if (!tHid.empty()) helpTextId = tHid; helpTextId = URLEncoder::encode(helpTextId); DBT keyDbt; memset(&keyDbt, 0, sizeof(keyDbt)); keyDbt.data = const_cast(helpTextId.c_str()); keyDbt.size = helpTextId.length(); DBT textDbt; memset(&textDbt, 0, sizeof(textDbt)); textDbt.data = const_cast(helpTextText.c_str()); textDbt.size = helpTextText.length(); helpText->put(helpText, NULL, &keyDbt, &textDbt, 0); } } if( !bExtensionMode || bIndexForExtension ) { // now the indexing xmlDocPtr document = streamTable.appl_doc; if (!document) document = streamTable.default_doc; if (document) { std::string temp = module; std::transform (temp.begin(), temp.end(), temp.begin(), tolower); xmlIndexBuilder->indexDocument(document, std::string("vnd.sun.star.help://") + temp + "/" + URLEncoder::encode(documentPath), ""); } } } // while loop over hzip files ending if( 
!bExtensionMode ) std::cout << std::endl; } // try catch( HelpProcessingException& ) { // catch HelpProcessingException to avoid locking data bases helpText->close(helpText, 0); dbBase->close(dbBase, 0); keyWord->close(keyWord, 0); throw; } helpText->close(helpText, 0); dbBase->close(dbBase, 0); helpKeyword.dump(keyWord); keyWord->close(keyWord, 0); if (!bExtensionMode && !helpFiles.empty()) { closeXMLIndexBuilder(); HCDBG(std::cerr << "dir is " << indexDirName.native_directory_string() << std::endl); jarOutputStream.addTree(indexDirName.native_file_string(), mod + ".idx"); } if( !bExtensionMode ) { jarOutputStream.addFile(helpTextFileName.native_file_string(), mod + ".ht"); jarOutputStream.addFile(dbBaseFileName.native_file_string(), mod + ".db"); jarOutputStream.addFile(keyWordFileName.native_file_string(), mod + ".key"); ///////////////////////////////////////////////////////////////////////// // last, all files which should be copied into the jar file ///////////////////////////////////////////////////////////////////////// Stringtable::iterator aEnd = additionalFiles.end(); for (Stringtable::iterator enumer = additionalFiles.begin(); enumer != aEnd; ++enumer) { const std::string &additionalFileKey = enumer->first; const std::string &additionalFileName = enumer->second; jarOutputStream.addFile(additionalFileName, additionalFileKey); } jarOutputStream.dontCompress(mod + ".jar"); jarOutputStream.commit(); HCDBG(std::cerr << "like to rename " << outputTmpFile.native_file_string() << " as " << outputFile.native_file_string() << std::endl); fs::rename(outputTmpFile, outputFile); if (!fs::exists(outputFile)) { std::stringstream aStrStream; aStrStream << "can't rename file '" << outputTmpFile.native_file_string() << "'" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } } ///////////////////////////////////////////////////////////////////////// /// remove temprary directory for index creation 
///////////////////////////////////////////////////////////////////////// #ifndef CMC_DEBUG if( !bExtensionMode ) fs::remove_all( indexDirParentName ); #endif } int HelpLinker::locCount; int HelpLinker::totCount; Stringtable HelpLinker::additionalFiles; HashSet HelpLinker::helpFiles; fs::path HelpLinker::sourceRoot; fs::path HelpLinker::embeddStylesheet, HelpLinker::indexStylesheet; fs::path HelpLinker::outputFile; std::string HelpLinker::module; std::string HelpLinker::lang; std::string HelpLinker::hid; std::string HelpLinker::extensionPath; bool HelpLinker::bExtensionMode; int GnTmpFileCounter = 0; void HelpLinker::main(std::vector &args, std::string* pExtensionPath) throw( HelpProcessingException ) { bExtensionMode = false; if( pExtensionPath && pExtensionPath->length() > 0 ) { helpFiles.clear(); bExtensionMode = true; extensionPath = *pExtensionPath; sourceRoot = fs::path(extensionPath); } if (args.size() > 0 && args[0][0] == '@') { std::vector stringList; std::string strBuf; std::ifstream fileReader(args[0].substr(1).c_str()); while (fileReader) { std::string token; fileReader >> token; if (!token.empty()) stringList.push_back(token); } args = stringList; } size_t i = 0; while (i < args.size()) { if (args[i].compare("-src") == 0) { ++i; if (i >= args.size()) { std::stringstream aStrStream; aStrStream << "sourceroot missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } if( !bExtensionMode ) sourceRoot = fs::path(args[i], fs::native); } else if (args[i].compare("-sty") == 0) { ++i; if (i >= args.size()) { std::stringstream aStrStream; aStrStream << "embeddingStylesheet missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } embeddStylesheet = fs::path(args[i], fs::native); } else if (args[i].compare("-idx") == 0) { ++i; if (i >= args.size()) { std::stringstream aStrStream; aStrStream << "indexstylesheet missing" << std::endl; throw HelpProcessingException( 
HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } indexStylesheet = fs::path(args[i], fs::native); } else if (args[i].compare("-o") == 0) { ++i; if (i >= args.size()) { std::stringstream aStrStream; aStrStream << "outputfilename missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } outputFile = fs::path(args[i], fs::native); } else if (args[i].compare("-mod") == 0) { ++i; if (i >= args.size()) { std::stringstream aStrStream; aStrStream << "module name missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } module = args[i]; } else if (args[i].compare("-lang") == 0) { ++i; if (i >= args.size()) { std::stringstream aStrStream; aStrStream << "language name missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } lang = args[i]; } else if (args[i].compare("-hid") == 0) { ++i; if (i >= args.size()) { std::stringstream aStrStream; aStrStream << "hid list missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } hid = args[i]; } else if (args[i].compare("-add") == 0) { std::string addFile, addFileUnderPath; ++i; if (i >= args.size()) { std::stringstream aStrStream; aStrStream << "pathname missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } addFileUnderPath = args[i]; ++i; if (i >= args.size()) { std::stringstream aStrStream; aStrStream << "pathname missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } addFile = args[i]; if (!addFileUnderPath.empty() && !addFile.empty()) additionalFiles[addFileUnderPath] = addFile; } else helpFiles.push_back(args[i]); ++i; } if (!bExtensionMode && indexStylesheet.empty()) { std::stringstream aStrStream; aStrStream << "no index file given" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } if 
(!bExtensionMode && embeddStylesheet.empty()) { std::stringstream aStrStream; aStrStream << "no embedding resolving file given" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } if (sourceRoot.empty()) { std::stringstream aStrStream; aStrStream << "no sourceroot given" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } if (!bExtensionMode && outputFile.empty()) { std::stringstream aStrStream; aStrStream << "no output file given" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } if (module.empty()) { std::stringstream aStrStream; aStrStream << "module missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } if (!bExtensionMode && lang.empty()) { std::stringstream aStrStream; aStrStream << "language missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } if (!bExtensionMode && hid.empty()) { std::stringstream aStrStream; aStrStream << "hid list missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } HelpLinker().link(); } int main(int argc, char**argv) { sal_uInt32 starttime = osl_getGlobalTimer(); std::vector args; for (int i = 1; i < argc; ++i) args.push_back(std::string(argv[i])); try { HelpLinker::main(args); } catch( const HelpProcessingException& e ) { std::cerr << e.m_aErrorMsg; exit(1); } sal_uInt32 endtime = osl_getGlobalTimer(); std::cout << "time taken was " << (endtime-starttime)/1000.0 << " seconds" << std::endl; return 0; } // Variable to set an exception in "C" StructuredXMLErrorFunction static const HelpProcessingException* GpXMLParsingException = NULL; extern "C" void StructuredXMLErrorFunction(void *userData, xmlErrorPtr error) { (void)userData; (void)error; std::string aErrorMsg = error->message; std::string aXMLParsingFile; if( error->file != NULL ) 
aXMLParsingFile = error->file; int nXMLParsingLine = error->line; HelpProcessingException* pException = new HelpProcessingException( aErrorMsg, aXMLParsingFile, nXMLParsingLine ); GpXMLParsingException = pException; // Reset error handler xmlSetStructuredErrorFunc( NULL, NULL ); } HelpProcessingErrorInfo& HelpProcessingErrorInfo::operator=( const struct HelpProcessingException& e ) { m_eErrorClass = e.m_eErrorClass; rtl::OString tmpErrorMsg( e.m_aErrorMsg.c_str() ); m_aErrorMsg = rtl::OStringToOUString( tmpErrorMsg, osl_getThreadTextEncoding() ); rtl::OString tmpXMLParsingFile( e.m_aXMLParsingFile.c_str() ); m_aXMLParsingFile = rtl::OStringToOUString( tmpXMLParsingFile, osl_getThreadTextEncoding() ); m_nXMLParsingLine = e.m_nXMLParsingLine; return *this; } // Returns true in case of success, false in case of error HELPLINKER_DLLPUBLIC bool compileExtensionHelp ( const rtl::OUString& aExtensionName, const rtl::OUString& aExtensionLanguageRoot, sal_Int32 nXhpFileCount, const rtl::OUString* pXhpFiles, HelpProcessingErrorInfo& o_rHelpProcessingErrorInfo ) { bool bSuccess = true; sal_Int32 argc = nXhpFileCount + 3; const char** argv = new const char*[argc]; argv[0] = ""; argv[1] = "-mod"; rtl::OString aOExtensionName = rtl::OUStringToOString( aExtensionName, osl_getThreadTextEncoding() ); argv[2] = aOExtensionName.getStr(); for( sal_Int32 iXhp = 0 ; iXhp < nXhpFileCount ; ++iXhp ) { rtl::OUString aXhpFile = pXhpFiles[iXhp]; rtl::OString aOXhpFile = rtl::OUStringToOString( aXhpFile, osl_getThreadTextEncoding() ); char* pArgStr = new char[aOXhpFile.getLength() + 1]; strcpy( pArgStr, aOXhpFile.getStr() ); argv[iXhp + 3] = pArgStr; } std::vector args; for( sal_Int32 i = 1; i < argc; ++i ) args.push_back(std::string( argv[i]) ); for( sal_Int32 iXhp = 0 ; iXhp < nXhpFileCount ; ++iXhp ) delete argv[iXhp + 3]; delete[] argv; rtl::OString aOExtensionLanguageRoot = rtl::OUStringToOString( aExtensionLanguageRoot, osl_getThreadTextEncoding() ); const char* pExtensionPath = 
aOExtensionLanguageRoot.getStr(); std::string aStdStrExtensionPath = pExtensionPath; // Set error handler xmlSetStructuredErrorFunc( NULL, (xmlStructuredErrorFunc)StructuredXMLErrorFunction ); try { HelpLinker::main(args,&aStdStrExtensionPath); } catch( const HelpProcessingException& e ) { if( GpXMLParsingException != NULL ) { o_rHelpProcessingErrorInfo = *GpXMLParsingException; delete GpXMLParsingException; GpXMLParsingException = NULL; } else { o_rHelpProcessingErrorInfo = e; } bSuccess = false; } // Reset error handler xmlSetStructuredErrorFunc( NULL, NULL ); // i83624: Tree files ::rtl::OUString aTreeFileURL = aExtensionLanguageRoot; aTreeFileURL += rtl::OUString::createFromAscii( "/help.tree" ); osl::DirectoryItem aTreeFileItem; osl::FileBase::RC rcGet = osl::DirectoryItem::get( aTreeFileURL, aTreeFileItem ); osl::FileStatus aFileStatus( FileStatusMask_FileSize ); if( rcGet == osl::FileBase::E_None && aTreeFileItem.getFileStatus( aFileStatus ) == osl::FileBase::E_None && aFileStatus.isValid( FileStatusMask_FileSize ) ) { sal_uInt64 ret, len = aFileStatus.getFileSize(); char* s = new char[ int(len) ]; // the buffer to hold the installed files osl::File aFile( aTreeFileURL ); aFile.open( OpenFlag_Read ); aFile.read( s, len, ret ); aFile.close(); XML_Parser parser = XML_ParserCreate( 0 ); int parsed = XML_Parse( parser, s, int( len ), true ); if( parsed == 0 ) { XML_Error nError = XML_GetErrorCode( parser ); o_rHelpProcessingErrorInfo.m_eErrorClass = HELPPROCESSING_XMLPARSING_ERROR; o_rHelpProcessingErrorInfo.m_aErrorMsg = rtl::OUString::createFromAscii( XML_ErrorString( nError ) );; o_rHelpProcessingErrorInfo.m_aXMLParsingFile = aTreeFileURL; // CRAHSES!!! o_rHelpProcessingErrorInfo.m_nXMLParsingLine = XML_GetCurrentLineNumber( parser ); bSuccess = false; } XML_ParserFree( parser ); delete[] s; } return bSuccess; } // vnd.sun.star.help://swriter/52821?Language=en-US&System=UNIX /* vi:set tabstop=4 shiftwidth=4 expandtab: */