related: fdo#73682 Introduce HTML detection service

Change-Id: I66bb579019ce8411b821c623955a454fd81cf811
Reviewed-on: https://gerrit.libreoffice.org/7600
Reviewed-by: Kohei Yoshida <libreoffice@kohei.us>
Tested-by: Kohei Yoshida <libreoffice@kohei.us>
This commit is contained in:
Maxim Monastirsky 2014-01-20 10:17:05 +02:00 committed by Kohei Yoshida
parent 6063555744
commit cc2893834d
10 changed files with 388 additions and 1 deletions

View File

@ -270,6 +270,7 @@ $(eval $(call gb_Helper_register_libraries_for_install,OOOLIBS,ooo, \
$(if $(ENABLE_DIRECTX),gdipluscanvas) \
guesslang \
$(if $(filter DESKTOP,$(BUILD_TYPE)),helplinker) \
htmlfd \
i18npool \
i18nsearch \
hyphen \

36
filter/Library_htmlfd.mk Normal file
View File

@ -0,0 +1,36 @@
# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
#*************************************************************************
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
#*************************************************************************
# Shared library 'htmlfd': the HTML filter detection UNO component.
$(eval $(call gb_Library_Library,htmlfd))

# Component description used to register the implementation.
$(eval $(call gb_Library_set_componentfile,htmlfd,filter/source/htmlfilterdetect/htmlfd))

# filterdetect.cxx includes <boost/scoped_ptr.hpp>, so the boost headers must
# be attached to *this* library (the target used to be 'xmlfd' by mistake).
$(eval $(call gb_Library_use_external,htmlfd,boost_headers))

$(eval $(call gb_Library_use_sdk_api,htmlfd))

# Libraries the detection code links against.
$(eval $(call gb_Library_use_libraries,htmlfd,\
	ucbhelper \
	cppuhelper \
	cppu \
	sal \
	tl \
	utl \
	svt \
	$(gb_UWINAPI) \
))

# Translation units that make up the library.
$(eval $(call gb_Library_add_exception_objects,htmlfd,\
	filter/source/htmlfilterdetect/fdcomp \
	filter/source/htmlfilterdetect/filterdetect \
))
# vim: set noet sw=4 ts=4:

View File

@ -34,6 +34,7 @@ $(eval $(call gb_Module_add_targets,filter,\
Library_exp) \
Library_filterconfig \
Library_flash \
Library_htmlfd \
Library_icd \
Library_icg \
Library_idx \

View File

@ -16,7 +16,7 @@
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
-->
<node oor:name="generic_HTML" oor:op="replace" >
<prop oor:name="DetectService"><value>com.sun.star.text.FormatDetector</value></prop>
<prop oor:name="DetectService"><value>com.sun.star.comp.filters.HtmlFilterDetect</value></prop>
<prop oor:name="URLPattern"><value>private:factory/swriter/web*</value></prop>
<prop oor:name="Extensions"><value>html htm</value></prop>
<prop oor:name="MediaType"><value>text/html</value></prop>

View File

@ -0,0 +1,36 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#include <sal/config.h>
#include <cppuhelper/factory.hxx>
#include <cppuhelper/implementationentry.hxx>
#include <sal/types.h>
#include "filterdetect.hxx"
namespace {

// Registration table handed to cppu::component_getFactoryHelper(): one entry
// for the HTML filter detection implementation, followed by the all-zero
// terminator entry.
cppu::ImplementationEntry const services[] = {
    {
        &HtmlFilterDetect_createInstance,
        &HtmlFilterDetect_getImplementationName,
        &HtmlFilterDetect_getSupportedServiceNames,
        &cppu::createSingleComponentFactory,
        0,
        0
    },
    { 0, 0, 0, 0, 0, 0 }
};

}
// Exported factory entry point of the library (the "htmlfd" prefix matches the
// prefix declared in the component file). It forwards the request to the
// cppu helper together with the local 'services' table.
extern "C" SAL_DLLPUBLIC_EXPORT void * SAL_CALL htmlfd_component_getFactory(
    char const * pImplName, void * pServiceManager, void * pRegistryKey)
{
    void * const pFactory = cppu::component_getFactoryHelper(
        pImplName, pServiceManager, pRegistryKey, services);
    return pFactory;
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

View File

@ -0,0 +1,232 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#include "filterdetect.hxx"
#include <svtools/htmltokn.h>
#include <tools/urlobj.hxx>
#include <ucbhelper/content.hxx>
#include <unotools/mediadescriptor.hxx>
#include <unotools/ucbstreamhelper.hxx>
#include <com/sun/star/io/XInputStream.hpp>
#include <cppuhelper/supportsservice.hxx>
#include <boost/scoped_ptr.hpp>
using com::sun::star::io::XInputStream;
using com::sun::star::uno::Sequence;
using com::sun::star::uno::Reference;
using com::sun::star::uno::Any;
using com::sun::star::uno::XComponentContext;
using com::sun::star::uno::XInterface;
using com::sun::star::uno::Exception;
using com::sun::star::uno::RuntimeException;
using com::sun::star::ucb::XCommandEnvironment;
using namespace com::sun::star;
using namespace com::sun::star::beans;
namespace {
enum DetectPhase {
BeforeTag,
TagOpened,
InTagName
};
bool isHTMLStream(const OString& aStreamHeader)
{
const char* pHeader = aStreamHeader.getStr();
const int nLength = aStreamHeader.getLength();
int nStartOfTagIndex = 0;
int i = 0;
DetectPhase dp = BeforeTag;
for ( i = 0; i < nLength; ++i, ++pHeader )
{
char c = *pHeader;
if ( c == ' ' || c == '\n' || c == '\t' )
{
if ( dp == TagOpened )
return false; // Invalid: Should start with a tag name
else if ( dp == InTagName )
break; // End of tag name reached
}
else if ( c == '<' )
{
if ( dp == BeforeTag )
dp = TagOpened;
else
return false; // Invalid: Nested '<'
}
else if ( c == '>' )
{
if ( dp == InTagName )
break; // End of tag name reached
else
return false; // Invalid: Empty tag or before '<'
}
else if ( c == '!' )
{
if ( i == 1 && dp == TagOpened )
return true; // "<!" at the very beginning of the file
else
return false; // Invalid: '!' before '<' or inside tag name
}
else
{
if ( dp == BeforeTag )
return false; // Invalid: Should start with a tag
else if ( dp == TagOpened )
{
nStartOfTagIndex = i;
dp = InTagName;
}
}
}
// The string following '<' has to be a known HTML token.
if ( GetHTMLToken( OStringToOUString( aStreamHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex ),
RTL_TEXTENCODING_ASCII_US ) ) != 0 )
return true;
return false;
}
}
/** XExtendedFilterDetection: check whether the described document is HTML.

    Reads the URL, the input stream and the document service from the media
    descriptor, inspects up to 4096 units from the start of the stream and
    hands them (lower-cased) to isHTMLStream().

    @param lDescriptor
        The media descriptor. When the content is HTML, no document service
        was given and the URL has the extension "xls", the DocumentService
        entry is set (and appended if missing) to the spreadsheet document
        service.

    @return "generic_HTML" if the content was recognized, otherwise an empty
            string (also when the stream cannot be obtained or an exception
            is caught).
*/
OUString SAL_CALL HtmlFilterDetect::detect(Sequence<PropertyValue>& lDescriptor)
    throw (RuntimeException)
{
    OUString aURL;
    OUString aDocumentService;
    Reference<XInputStream> xStream;

    const sal_Int32 nPropCount = lDescriptor.getLength();
    // Index of the DocumentService entry; nPropCount means "not present".
    sal_Int32 nDocServiceIdx = nPropCount;
    const PropertyValue* const pProps = lDescriptor.getConstArray();
    for ( sal_Int32 n = 0; n < nPropCount; ++n )
    {
        const PropertyValue& rProp = pProps[n];
        if ( rProp.Name == utl::MediaDescriptor::PROP_URL() )
            rProp.Value >>= aURL;
        else if ( rProp.Name == utl::MediaDescriptor::PROP_INPUTSTREAM() )
            rProp.Value >>= xStream;
        else if ( rProp.Name == utl::MediaDescriptor::PROP_DOCUMENTSERVICE() )
        {
            nDocServiceIdx = n;
            rProp.Value >>= aDocumentService;
        }
    }

    try
    {
        // No stream in the descriptor: try to open one from the URL.
        if ( !xStream.is() )
        {
            ucbhelper::Content aContent( aURL, Reference<XCommandEnvironment>(), mxCtx );
            xStream = aContent.openStream();
            if ( !xStream.is() )
                return OUString();
        }

        boost::scoped_ptr<SvStream> pStream( utl::UcbStreamHelper::CreateStream( xStream ) );
        if ( !pStream || pStream->GetError() )
            return OUString();

        // The stream position after this call selects how the header is read.
        pStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
        const sal_Size nPosAfterStart = pStream->Tell();
        const sal_uInt16 nHeaderSize = 4096;

        OString aHeader;
        if ( nPosAfterStart == 3 || nPosAfterStart == 0 )
        {
            // UTF-8 or non-Unicode
            aHeader = read_uInt8s_ToOString( *pStream, nHeaderSize );
        }
        else
        {
            // UTF-16
            aHeader = OUStringToOString(
                read_uInt16s_ToOUString( *pStream, nHeaderSize ), RTL_TEXTENCODING_ASCII_US );
        }

        if ( !isHTMLStream( aHeader.toAsciiLowerCase() ) )
            return OUString(); // Failed

        // Some Apps/Web services use ".xls" extension to indicate that
        // the given file should be opened by a spreadsheet software
        if ( aDocumentService.isEmpty() )
        {
            INetURLObject aParsedURL( aURL );
            const OUString aExtension = aParsedURL.getExtension(
                INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET ).toAsciiLowerCase();
            if ( aExtension == "xls" )
            {
                // Append a DocumentService entry if the descriptor had none.
                if ( nDocServiceIdx == lDescriptor.getLength() )
                {
                    lDescriptor.realloc( nDocServiceIdx + 1 );
                    lDescriptor[nDocServiceIdx].Name = utl::MediaDescriptor::PROP_DOCUMENTSERVICE();
                }
                lDescriptor[nDocServiceIdx].Value <<= OUString( "com.sun.star.sheet.SpreadsheetDocument" );
            }
        }
        return OUString( "generic_HTML" );
    }
    catch (const Exception &)
    {
        OSL_FAIL( "An Exception occurred while opening File stream" );
    }

    return OUString(); // Failed
}
// XInitialization
// The arguments are ignored; this implementation has nothing to set up.
void SAL_CALL HtmlFilterDetect::initialize(const Sequence<Any>& /*aArguments*/)
    throw (Exception, RuntimeException)
{
    // Intentionally empty.
}
// Returns the implementation name of the HTML filter detection component.
OUString HtmlFilterDetect_getImplementationName()
{
    const OUString aImplName( "com.sun.star.comp.filters.HtmlFilterDetect" );
    return aImplName;
}
// Returns the two service names reported by the component.
Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames()
{
    Sequence<OUString> aServices(2);
    aServices[0] = "com.sun.star.document.ExtendedTypeDetection";
    aServices[1] = "com.sun.star.comp.filters.HtmlFilterDetect";
    return aServices;
}
// Creates a new HtmlFilterDetect bound to the given component context and
// returns it as an XInterface reference.
Reference<XInterface> HtmlFilterDetect_createInstance(const Reference<XComponentContext>& rCtx)
{
    cppu::OWeakObject* const pInstance = static_cast<cppu::OWeakObject*>( new HtmlFilterDetect( rCtx ) );
    return pInstance;
}
// XServiceInfo
// Delegates to the free function so that the name is defined in one place.
OUString SAL_CALL HtmlFilterDetect::getImplementationName()
    throw (RuntimeException)
{
    const OUString aName( HtmlFilterDetect_getImplementationName() );
    return aName;
}
// Answers the query through the generic cppu helper, passing this object
// and the requested service name.
sal_Bool SAL_CALL HtmlFilterDetect::supportsService(const OUString& rServiceName)
    throw (RuntimeException)
{
    return cppu::supportsService(
        this,
        rServiceName );
}
// Delegates to the free function that builds the service name list.
Sequence<OUString> SAL_CALL HtmlFilterDetect::getSupportedServiceNames()
    throw (RuntimeException)
{
    const Sequence<OUString> aNames( HtmlFilterDetect_getSupportedServiceNames() );
    return aNames;
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

View File

@ -0,0 +1,64 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#ifndef INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX
#define INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX
#include <com/sun/star/document/XExtendedFilterDetection.hpp>
#include <com/sun/star/lang/XInitialization.hpp>
#include <com/sun/star/lang/XServiceInfo.hpp>
#include <com/sun/star/uno/XComponentContext.hpp>
#include <cppuhelper/implbase3.hxx>
/** UNO component that detects HTML content.

    Implements XExtendedFilterDetection (the actual detection),
    XInitialization and XServiceInfo.
*/
class HtmlFilterDetect : public cppu::WeakImplHelper3<
    com::sun::star::document::XExtendedFilterDetection,
    com::sun::star::lang::XInitialization,
    com::sun::star::lang::XServiceInfo>
{
    /// Component context the instance was created with.
    com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext> mxCtx;

public:
    /** Creates the detector.

        Declared explicit so that a context reference is never silently
        converted into a detector object.

        @param xCtx  component context stored for later use
    */
    explicit HtmlFilterDetect(const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& xCtx) :
        mxCtx(xCtx) {}
    virtual ~HtmlFilterDetect() {}

    // XExtendedFilterDetection
    /** Inspects the document described by lDescriptor.

        @param lDescriptor  media descriptor; passed by non-const reference
        @return the detected type name as a string
    */
    virtual OUString SAL_CALL detect(com::sun::star::uno::Sequence<com::sun::star::beans::PropertyValue>& lDescriptor)
        throw (com::sun::star::uno::RuntimeException);

    // XInitialization
    virtual void SAL_CALL initialize(const ::com::sun::star::uno::Sequence<com::sun::star::uno::Any>& aArguments)
        throw (com::sun::star::uno::Exception, com::sun::star::uno::RuntimeException);

    // XServiceInfo
    virtual OUString SAL_CALL getImplementationName()
        throw (com::sun::star::uno::RuntimeException);
    virtual sal_Bool SAL_CALL supportsService(const OUString& ServiceName)
        throw (com::sun::star::uno::RuntimeException);
    virtual com::sun::star::uno::Sequence<OUString> SAL_CALL getSupportedServiceNames()
        throw (com::sun::star::uno::RuntimeException);
};
// Free functions describing and creating the component.

// Returns the implementation name of the component.
OUString HtmlFilterDetect_getImplementationName();
// Returns the service names supported by the component.
com::sun::star::uno::Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames();
// Creates a new HtmlFilterDetect instance for the given component context.
com::sun::star::uno::Reference<com::sun::star::uno::XInterface>
HtmlFilterDetect_createInstance(const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& rCtx);
#endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

View File

@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
-->
<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@"
prefix="htmlfd" xmlns="http://openoffice.org/2010/uno-components">
<implementation name="com.sun.star.comp.filters.HtmlFilterDetect">
<service name="com.sun.star.document.ExtendedTypeDetection"/>
</implementation>
</component>

View File

@ -29,6 +29,7 @@ $(eval $(call gb_Rdb_add_components,services,\
filter/source/config/cache/filterconfig1 \
filter/source/flash/flash \
filter/source/graphic/graphicfilter \
filter/source/htmlfilterdetect/htmlfd \
filter/source/msfilter/msfilter \
filter/source/odfflatxml/odfflatxml \
filter/source/pdf/pdffilter \

View File

@ -46,6 +46,7 @@ gb_EXTRAMERGEDLIBS := \
graphicfilter \
guesslang \
$(if $(ENABLE_JAVA),hsqldb) \
htmlfd \
hyphen \
icd \
icg \