2010-10-12 15:57:08 +02:00
|
|
|
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
2012-10-08 10:00:18 +01:00
|
|
|
/*
|
|
|
|
* This file is part of the LibreOffice project.
|
2007-06-19 15:01:46 +00:00
|
|
|
*
|
2012-10-08 10:00:18 +01:00
|
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
2007-06-19 15:01:46 +00:00
|
|
|
*
|
2012-10-08 10:00:18 +01:00
|
|
|
* This file incorporates work covered by the following license notice:
|
2007-06-19 15:01:46 +00:00
|
|
|
*
|
2012-10-08 10:00:18 +01:00
|
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
|
|
* with this work for additional information regarding copyright
|
|
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
|
|
* except in compliance with the License. You may obtain a copy of
|
|
|
|
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
|
|
|
*/
|
2007-06-19 15:01:46 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
|
|
|
*
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* TODO
|
|
|
|
* - Add exception throwing when h == NULL
|
|
|
|
* - Not init h when implicit constructor is launched
|
|
|
|
*/
|
2007-06-21 06:56:38 +00:00
|
|
|
|
2007-06-19 15:01:46 +00:00
|
|
|
#include <string.h>
|
|
|
|
#include <sstream>
|
|
|
|
#include <iostream>
|
|
|
|
|
2012-11-10 10:53:51 +01:00
|
|
|
#ifdef SYSTEM_LIBEXTTEXTCAT
|
2011-10-04 22:34:32 +02:00
|
|
|
#include <libexttextcat/textcat.h>
|
|
|
|
#include <libexttextcat/common.h>
|
|
|
|
#include <libexttextcat/constants.h>
|
|
|
|
#include <libexttextcat/fingerprint.h>
|
|
|
|
#include <libexttextcat/utf8misc.h>
|
2012-11-09 17:25:56 -05:00
|
|
|
#else
|
|
|
|
#include <textcat.h>
|
|
|
|
#include <common.h>
|
|
|
|
#include <constants.h>
|
|
|
|
#include <fingerprint.h>
|
|
|
|
#include <utf8misc.h>
|
|
|
|
#endif
|
2007-06-19 15:01:46 +00:00
|
|
|
|
|
|
|
#include <sal/types.h>
|
|
|
|
|
|
|
|
#include "altstrfunc.hxx"
|
|
|
|
#include "simpleguesser.hxx"
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The following three structures are from fingerprint.c and textcat.c
|
|
|
|
*/
|
|
|
|
typedef struct ngram_t {
|
|
|
|
|
|
|
|
sint2 rank;
|
|
|
|
char str[MAXNGRAMSIZE+1];
|
|
|
|
|
|
|
|
} ngram_t;
|
|
|
|
|
|
|
|
typedef struct fp_t {
|
|
|
|
|
|
|
|
const char *name;
|
|
|
|
ngram_t *fprint;
|
|
|
|
uint4 size;
|
|
|
|
|
|
|
|
} fp_t;
|
|
|
|
|
|
|
|
typedef struct textcat_t{
|
|
|
|
|
|
|
|
void **fprint;
|
|
|
|
char *fprint_disable;
|
|
|
|
uint4 size;
|
|
|
|
uint4 maxsize;
|
|
|
|
|
|
|
|
char output[MAXOUTPUTSIZE];
|
|
|
|
|
|
|
|
} textcat_t;
|
2014-02-26 16:06:37 +01:00
|
|
|
// end of the 3 structs
|
2007-06-19 15:01:46 +00:00
|
|
|
|
|
|
|
/** Creates a guesser with no textcat handle; SetDBPath() installs one. */
SimpleGuesser::SimpleGuesser()
    : h(NULL)
{
}
|
|
|
|
|
2012-12-29 13:36:43 +01:00
|
|
|
/**
 * Assignment: releases the handle currently held (if any) and takes over the
 * handle value stored in sg. Self-assignment leaves the object untouched.
 *
 * NOTE(review): the handle is copied as a plain pointer value, so afterwards
 * both objects hold the same handle and both destructors will pass it to
 * textcat_Done(). This looks like a double release -- confirm and fix at
 * class level.
 */
SimpleGuesser& SimpleGuesser::operator=(const SimpleGuesser& sg)
{
    if (this != &sg)
    {
        // Drop our own handle before adopting the other one.
        if (h)
        {
            textcat_Done(h);
        }
        h = sg.h;
    }
    return *this;
}
|
|
|
|
|
|
|
|
/** Hands the textcat handle, if one is set, to textcat_Done(). */
SimpleGuesser::~SimpleGuesser()
{
    if (!h)
        return;

    textcat_Done(h);
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
\fn SimpleGuesser::GuessLanguage(char* text)
|
|
|
|
*/
|
2011-09-26 22:32:26 +01:00
|
|
|
vector<Guess> SimpleGuesser::GuessLanguage(const char* text)
|
2007-06-19 15:01:46 +00:00
|
|
|
{
|
2011-09-26 22:23:37 +01:00
|
|
|
vector<Guess> guesses;
|
2007-06-19 15:01:46 +00:00
|
|
|
|
2011-09-26 22:23:37 +01:00
|
|
|
if (!h)
|
|
|
|
return guesses;
|
2007-06-19 15:01:46 +00:00
|
|
|
|
2012-05-25 15:51:04 +01:00
|
|
|
int len = strlen(text);
|
2007-06-19 15:01:46 +00:00
|
|
|
|
2011-09-26 22:23:37 +01:00
|
|
|
if (len > MAX_STRING_LENGTH_TO_ANALYSE)
|
|
|
|
len = MAX_STRING_LENGTH_TO_ANALYSE;
|
2007-06-19 15:01:46 +00:00
|
|
|
|
2011-09-26 22:32:26 +01:00
|
|
|
const char *guess_list = textcat_Classify(h, text, len);
|
2007-06-19 15:01:46 +00:00
|
|
|
|
2013-03-25 19:09:50 +01:00
|
|
|
// FIXME just a temporary check until new version with renamed macros deployed
|
|
|
|
#if EXTTEXTCAT_VERSION_MAJOR > 3 || (EXTTEXTCAT_VERSION_MAJOR == 3 && (EXTTEXTCAT_VERSION_MINOR > 4 || (EXTTEXTCAT_VERSION_MINOR == 4 && (EXTTEXTCAT_VERSION_MICRO >= 1))))
|
2013-03-25 18:19:45 +01:00
|
|
|
if (strcmp(guess_list, TEXTCAT_RESULT_SHORT_STR) == 0)
|
2013-03-25 19:09:50 +01:00
|
|
|
#else
|
|
|
|
if (strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0)
|
|
|
|
#endif
|
2011-09-26 22:23:37 +01:00
|
|
|
return guesses;
|
2007-06-19 15:01:46 +00:00
|
|
|
|
2011-09-26 22:23:37 +01:00
|
|
|
int current_pointer = 0;
|
2007-06-19 15:01:46 +00:00
|
|
|
|
2011-09-26 22:23:37 +01:00
|
|
|
for(int i = 0; guess_list[current_pointer] != '\0'; i++)
|
|
|
|
{
|
2011-09-26 22:32:26 +01:00
|
|
|
while (guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0')
|
2011-09-26 22:23:37 +01:00
|
|
|
current_pointer++;
|
|
|
|
if(guess_list[current_pointer] != '\0')
|
2007-06-19 15:01:46 +00:00
|
|
|
{
|
2011-09-26 22:32:26 +01:00
|
|
|
Guess g(guess_list + current_pointer);
|
2007-06-19 15:01:46 +00:00
|
|
|
|
2011-09-26 22:23:37 +01:00
|
|
|
guesses.push_back(g);
|
2007-06-19 15:01:46 +00:00
|
|
|
|
2011-09-26 22:23:37 +01:00
|
|
|
current_pointer++;
|
2007-06-19 15:01:46 +00:00
|
|
|
}
|
2011-09-26 22:23:37 +01:00
|
|
|
}
|
2007-06-19 15:01:46 +00:00
|
|
|
|
|
|
|
return guesses;
|
|
|
|
}
|
|
|
|
|
2011-09-26 22:32:26 +01:00
|
|
|
/**
 * Returns the first entry of GuessLanguage(text), or a default-constructed
 * Guess when GuessLanguage() yields nothing.
 */
Guess SimpleGuesser::GuessPrimaryLanguage(const char* text)
{
    const vector<Guess> candidates = GuessLanguage(text);

    if (candidates.empty())
        return Guess();

    return candidates.front();
}
|
|
|
|
/**
|
2013-05-06 20:07:23 +02:00
|
|
|
* Is used to know which language is available, unavailable or both
|
2007-06-19 15:01:46 +00:00
|
|
|
* when mask = 0xF0, return only Available
|
|
|
|
* when mask = 0x0F, return only Unavailable
|
|
|
|
* when mask = 0xFF, return both Available and Unavailable
|
|
|
|
*/
|
|
|
|
vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
|
|
|
|
{
|
|
|
|
textcat_t *tables = (textcat_t*)h;
|
|
|
|
|
|
|
|
vector<Guess> lang;
|
|
|
|
if(!h){return lang;}
|
|
|
|
|
2011-09-26 22:38:26 +01:00
|
|
|
for (size_t i=0; i<tables->size; ++i)
|
|
|
|
{
|
|
|
|
if (tables->fprint_disable[i] & mask)
|
|
|
|
{
|
2007-06-19 15:01:46 +00:00
|
|
|
string langStr = "[";
|
2011-09-26 22:38:26 +01:00
|
|
|
langStr += fp_Name(tables->fprint[i]);
|
|
|
|
Guess g(langStr.c_str());
|
2007-06-19 15:01:46 +00:00
|
|
|
lang.push_back(g);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return lang;
|
|
|
|
}
|
|
|
|
|
2011-09-26 22:38:26 +01:00
|
|
|
/** Returns GetManagedLanguages() for the 0xF0 mask (available languages). */
vector<Guess> SimpleGuesser::GetAvailableLanguages()
{
    const char availableMask = sal::static_int_cast< char >( 0xF0 );
    return GetManagedLanguages(availableMask);
}
|
|
|
|
|
2011-09-26 22:38:26 +01:00
|
|
|
/** Returns GetManagedLanguages() for the 0x0F mask (unavailable languages). */
vector<Guess> SimpleGuesser::GetUnavailableLanguages()
{
    const char unavailableMask = sal::static_int_cast< char >( 0x0F );
    return GetManagedLanguages(unavailableMask);
}
|
|
|
|
|
2011-09-26 22:38:26 +01:00
|
|
|
/** Returns GetManagedLanguages() for the 0xFF mask (available and unavailable). */
vector<Guess> SimpleGuesser::GetAllManagedLanguages()
{
    const char everyMask = sal::static_int_cast< char >( 0xFF );
    return GetManagedLanguages(everyMask);
}
|
|
|
|
|
2011-09-26 22:38:26 +01:00
|
|
|
/**
 * Stores mask as the flag byte of every managed language for which
 * start(name, lang) returns 0, where name is the value reported by fp_Name().
 * Does nothing when no handle is set.
 *
 * NOTE(review): start() comes from altstrfunc.hxx; presumably a prefix
 * comparison -- confirm there.
 */
void SimpleGuesser::XableLanguage(string lang, char mask)
{
    if (!h)
        return;

    // Look inside the opaque handle through the local textcat_t layout.
    textcat_t *tables = static_cast<textcat_t*>(h);

    for (size_t idx = 0; idx < tables->size; ++idx)
    {
        string language(fp_Name(tables->fprint[idx]));

        if (start(language, lang) != 0)
            continue;

        tables->fprint_disable[idx] = mask;
    }
}
|
|
|
|
|
2011-09-26 22:38:26 +01:00
|
|
|
/** Applies the 0xF0 flag to the languages matched by lang via XableLanguage(). */
void SimpleGuesser::EnableLanguage(string lang)
{
    const char enabledFlag = sal::static_int_cast< char >( 0xF0 );
    XableLanguage(lang, enabledFlag);
}
|
|
|
|
|
2011-09-26 22:38:26 +01:00
|
|
|
/** Applies the 0x0F flag to the languages matched by lang via XableLanguage(). */
void SimpleGuesser::DisableLanguage(string lang)
{
    const char disabledFlag = sal::static_int_cast< char >( 0x0F );
    XableLanguage(lang, disabledFlag);
}
|
|
|
|
|
2011-09-26 22:38:26 +01:00
|
|
|
void SimpleGuesser::SetDBPath(const char* path, const char* prefix)
|
|
|
|
{
|
|
|
|
if (h)
|
2007-06-19 15:01:46 +00:00
|
|
|
textcat_Done(h);
|
|
|
|
h = special_textcat_Init(path, prefix);
|
|
|
|
}
|
2010-10-12 15:57:08 +02:00
|
|
|
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|