From cc2893834d8ac699dbb38b152f21f17f3debb06b Mon Sep 17 00:00:00 2001 From: Maxim Monastirsky Date: Mon, 20 Jan 2014 10:17:05 +0200 Subject: [PATCH] related: fdo#73682 Introduce HTML detection service Change-Id: I66bb579019ce8411b821c623955a454fd81cf811 Reviewed-on: https://gerrit.libreoffice.org/7600 Reviewed-by: Kohei Yoshida Tested-by: Kohei Yoshida --- Repository.mk | 1 + filter/Library_htmlfd.mk | 36 +++ filter/Module_filter.mk | 1 + .../config/fragments/types/generic_HTML.xcu | 2 +- filter/source/htmlfilterdetect/fdcomp.cxx | 36 +++ .../source/htmlfilterdetect/filterdetect.cxx | 232 ++++++++++++++++++ .../source/htmlfilterdetect/filterdetect.hxx | 64 +++++ .../source/htmlfilterdetect/htmlfd.component | 15 ++ postprocess/Rdb_services.mk | 1 + .../gbuild/extensions/pre_MergedLibsList.mk | 1 + 10 files changed, 388 insertions(+), 1 deletion(-) create mode 100644 filter/Library_htmlfd.mk create mode 100644 filter/source/htmlfilterdetect/fdcomp.cxx create mode 100644 filter/source/htmlfilterdetect/filterdetect.cxx create mode 100644 filter/source/htmlfilterdetect/filterdetect.hxx create mode 100644 filter/source/htmlfilterdetect/htmlfd.component diff --git a/Repository.mk b/Repository.mk index 6c4d488a0d64..7066001679bc 100644 --- a/Repository.mk +++ b/Repository.mk @@ -270,6 +270,7 @@ $(eval $(call gb_Helper_register_libraries_for_install,OOOLIBS,ooo, \ $(if $(ENABLE_DIRECTX),gdipluscanvas) \ guesslang \ $(if $(filter DESKTOP,$(BUILD_TYPE)),helplinker) \ + htmlfd \ i18npool \ i18nsearch \ hyphen \ diff --git a/filter/Library_htmlfd.mk b/filter/Library_htmlfd.mk new file mode 100644 index 000000000000..a147509e899e --- /dev/null +++ b/filter/Library_htmlfd.mk @@ -0,0 +1,36 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +#************************************************************************* +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +#************************************************************************* + +$(eval $(call gb_Library_Library,htmlfd)) + +$(eval $(call gb_Library_set_componentfile,htmlfd,filter/source/htmlfilterdetect/htmlfd)) + +$(eval $(call gb_Library_use_external,htmlfd,boost_headers)) + +$(eval $(call gb_Library_use_sdk_api,htmlfd)) + +$(eval $(call gb_Library_use_libraries,htmlfd,\ + ucbhelper \ + cppuhelper \ + cppu \ + sal \ + tl \ + utl \ + svt \ + $(gb_UWINAPI) \ +)) + +$(eval $(call gb_Library_add_exception_objects,htmlfd,\ + filter/source/htmlfilterdetect/fdcomp \ + filter/source/htmlfilterdetect/filterdetect \ +)) + +# vim: set noet sw=4 ts=4: diff --git a/filter/Module_filter.mk b/filter/Module_filter.mk index 403184a93feb..58307b42a7e9 100644 --- a/filter/Module_filter.mk +++ b/filter/Module_filter.mk @@ -34,6 +34,7 @@ $(eval $(call gb_Module_add_targets,filter,\ Library_exp) \ Library_filterconfig \ Library_flash \ + Library_htmlfd \ Library_icd \ Library_icg \ Library_idx \ diff --git a/filter/source/config/fragments/types/generic_HTML.xcu b/filter/source/config/fragments/types/generic_HTML.xcu index ede6d2b8fefb..58ffedc85f1e 100644 --- a/filter/source/config/fragments/types/generic_HTML.xcu +++ b/filter/source/config/fragments/types/generic_HTML.xcu @@ -16,7 +16,7 @@ * the License at http://www.apache.org/licenses/LICENSE-2.0 . --> - com.sun.star.text.FormatDetector + com.sun.star.comp.filters.HtmlFilterDetect private:factory/swriter/web* html htm text/html diff --git a/filter/source/htmlfilterdetect/fdcomp.cxx b/filter/source/htmlfilterdetect/fdcomp.cxx new file mode 100644 index 000000000000..40360e923c33 --- /dev/null +++ b/filter/source/htmlfilterdetect/fdcomp.cxx @@ -0,0 +1,36 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. 
+ * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include +#include +#include + +#include "filterdetect.hxx" + +namespace { + +static cppu::ImplementationEntry const services[] = { + { &HtmlFilterDetect_createInstance, &HtmlFilterDetect_getImplementationName, + &HtmlFilterDetect_getSupportedServiceNames, + &cppu::createSingleComponentFactory, 0, 0 }, + { 0, 0, 0, 0, 0, 0 } +}; + +} + +extern "C" SAL_DLLPUBLIC_EXPORT void * SAL_CALL htmlfd_component_getFactory( + char const * pImplName, void * pServiceManager, void * pRegistryKey) +{ + return cppu::component_getFactoryHelper( + pImplName, pServiceManager, pRegistryKey, services); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/htmlfilterdetect/filterdetect.cxx b/filter/source/htmlfilterdetect/filterdetect.cxx new file mode 100644 index 000000000000..140912d37379 --- /dev/null +++ b/filter/source/htmlfilterdetect/filterdetect.cxx @@ -0,0 +1,232 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include "filterdetect.hxx" + +#include +#include +#include +#include +#include + +#include +#include + +#include + +using com::sun::star::io::XInputStream; +using com::sun::star::uno::Sequence; +using com::sun::star::uno::Reference; +using com::sun::star::uno::Any; +using com::sun::star::uno::XComponentContext; +using com::sun::star::uno::XInterface; +using com::sun::star::uno::Exception; +using com::sun::star::uno::RuntimeException; +using com::sun::star::ucb::XCommandEnvironment; + +using namespace com::sun::star; +using namespace com::sun::star::beans; + +namespace { + +enum DetectPhase { + BeforeTag, + TagOpened, + InTagName +}; + +bool isHTMLStream(const OString& aStreamHeader) +{ + const char* pHeader = aStreamHeader.getStr(); + const int nLength = aStreamHeader.getLength(); + int nStartOfTagIndex = 0; + int i = 0; + + DetectPhase dp = BeforeTag; + + for ( i = 0; i < nLength; ++i, ++pHeader ) + { + char c = *pHeader; + if ( c == ' ' || c == '\n' || c == '\t' ) + { + if ( dp == TagOpened ) + return false; // Invalid: Should start with a tag name + else if ( dp == InTagName ) + break; // End of tag name reached + } + else if ( c == '<' ) + { + if ( dp == BeforeTag ) + dp = TagOpened; + else + return false; // Invalid: Nested '<' + } + else if ( c == '>' ) + { + if ( dp == InTagName ) + break; // End of tag name reached + else + return false; // Invalid: Empty tag or before '<' + } + else if ( c == '!' 
) + { + if ( i == 1 && dp == TagOpened ) + return true; // "& lDescriptor) + throw (RuntimeException) +{ + OUString sUrl; + OUString sDocService; + OString resultString; + Reference xInStream; + + const PropertyValue *pValue = lDescriptor.getConstArray(); + sal_Int32 nLength = lDescriptor.getLength(); + sal_Int32 location = nLength; + + for ( sal_Int32 i = 0; i < nLength; ++i ) + { + if ( pValue[i].Name == utl::MediaDescriptor::PROP_URL() ) + pValue[i].Value >>= sUrl; + else if ( pValue[i].Name == utl::MediaDescriptor::PROP_INPUTSTREAM() ) + pValue[i].Value >>= xInStream; + else if ( pValue[i].Name == utl::MediaDescriptor::PROP_DOCUMENTSERVICE() ) + { + location = i; + pValue[i].Value >>= sDocService; + } + } + + try + { + if ( !xInStream.is() ) + { + ucbhelper::Content aContent( sUrl, Reference(), mxCtx ); + xInStream = aContent.openStream(); + if ( !xInStream.is() ) + return OUString(); + } + + boost::scoped_ptr pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) ); + if ( !pInStream || pInStream->GetError() ) + return OUString(); + + pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); + sal_Size nUniPos = pInStream->Tell(); + + const sal_uInt16 nSize = 4096; + + if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode + resultString = read_uInt8s_ToOString( *pInStream, nSize ); + else // UTF-16 + resultString = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US ); + + if ( isHTMLStream( resultString.toAsciiLowerCase() ) ) + { + // Some Apps/Web services use ".xls" extension to indicate that + // the given file should be opened by a spreadsheet software + if ( sDocService.isEmpty() ) + { + INetURLObject aParser( sUrl ); + OUString aExt = aParser.getExtension( INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET ); + aExt = aExt.toAsciiLowerCase(); + + if ( aExt == "xls" ) + { + if ( location == lDescriptor.getLength() ) + { + lDescriptor.realloc( location + 1 ); + 
lDescriptor[location].Name = utl::MediaDescriptor::PROP_DOCUMENTSERVICE(); + } + lDescriptor[location].Value <<= OUString( "com.sun.star.sheet.SpreadsheetDocument" ); + } + } + return OUString( "generic_HTML" ); + } + } + catch (const Exception &) + { + OSL_FAIL( "An Exception occurred while opening File stream" ); + } + + return OUString(); // Failed +} + +// XInitialization + +void SAL_CALL HtmlFilterDetect::initialize(const Sequence& /*aArguments*/) + throw (Exception, RuntimeException) +{ +} + +OUString HtmlFilterDetect_getImplementationName() +{ + return OUString( "com.sun.star.comp.filters.HtmlFilterDetect" ); +} + +Sequence HtmlFilterDetect_getSupportedServiceNames() +{ + Sequence aRet(2); + OUString* pArray = aRet.getArray(); + pArray[0] = "com.sun.star.document.ExtendedTypeDetection"; + pArray[1] = "com.sun.star.comp.filters.HtmlFilterDetect"; + return aRet; +} + +Reference HtmlFilterDetect_createInstance(const Reference& rCtx) +{ + return (cppu::OWeakObject*) new HtmlFilterDetect( rCtx ); +} + +// XServiceInfo + +OUString SAL_CALL HtmlFilterDetect::getImplementationName() + throw (RuntimeException) +{ + return HtmlFilterDetect_getImplementationName(); +} + +sal_Bool SAL_CALL HtmlFilterDetect::supportsService(const OUString& rServiceName) + throw (RuntimeException) +{ + return cppu::supportsService( this, rServiceName ); +} + +Sequence SAL_CALL HtmlFilterDetect::getSupportedServiceNames() + throw (RuntimeException) +{ + return HtmlFilterDetect_getSupportedServiceNames(); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/htmlfilterdetect/filterdetect.hxx b/filter/source/htmlfilterdetect/filterdetect.hxx new file mode 100644 index 000000000000..631d4d3715e5 --- /dev/null +++ b/filter/source/htmlfilterdetect/filterdetect.hxx @@ -0,0 +1,64 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. 
+ * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX +#define INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX + +#include +#include +#include +#include + +#include + +class HtmlFilterDetect : public cppu::WeakImplHelper3< + com::sun::star::document::XExtendedFilterDetection, + com::sun::star::lang::XInitialization, + com::sun::star::lang::XServiceInfo> +{ + com::sun::star::uno::Reference mxCtx; + +public: + + HtmlFilterDetect(const com::sun::star::uno::Reference& xCtx) : + mxCtx(xCtx) {} + virtual ~HtmlFilterDetect() {} + + // XExtendedFilterDetection + + virtual OUString SAL_CALL detect(com::sun::star::uno::Sequence& lDescriptor) + throw (com::sun::star::uno::RuntimeException); + + // XInitialization + + virtual void SAL_CALL initialize(const ::com::sun::star::uno::Sequence& aArguments) + throw (com::sun::star::uno::Exception, com::sun::star::uno::RuntimeException); + + // XServiceInfo + + virtual OUString SAL_CALL getImplementationName() + throw (com::sun::star::uno::RuntimeException); + + virtual sal_Bool SAL_CALL supportsService(const OUString& ServiceName) + throw (com::sun::star::uno::RuntimeException); + + virtual com::sun::star::uno::Sequence SAL_CALL getSupportedServiceNames() + throw (com::sun::star::uno::RuntimeException); +}; + +OUString HtmlFilterDetect_getImplementationName(); + +com::sun::star::uno::Sequence HtmlFilterDetect_getSupportedServiceNames(); + +com::sun::star::uno::Reference +HtmlFilterDetect_createInstance(const com::sun::star::uno::Reference& rCtx); + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/htmlfilterdetect/htmlfd.component b/filter/source/htmlfilterdetect/htmlfd.component new file mode 100644 index 000000000000..32c41b8bef26 --- /dev/null +++ 
b/filter/source/htmlfilterdetect/htmlfd.component @@ -0,0 +1,15 @@ + + + + + + + + diff --git a/postprocess/Rdb_services.mk b/postprocess/Rdb_services.mk index cd8e3c92bae4..b0c8a10d29af 100755 --- a/postprocess/Rdb_services.mk +++ b/postprocess/Rdb_services.mk @@ -29,6 +29,7 @@ $(eval $(call gb_Rdb_add_components,services,\ filter/source/config/cache/filterconfig1 \ filter/source/flash/flash \ filter/source/graphic/graphicfilter \ + filter/source/htmlfilterdetect/htmlfd \ filter/source/msfilter/msfilter \ filter/source/odfflatxml/odfflatxml \ filter/source/pdf/pdffilter \ diff --git a/solenv/gbuild/extensions/pre_MergedLibsList.mk b/solenv/gbuild/extensions/pre_MergedLibsList.mk index 9cc207915e11..ba7ad86aeaff 100644 --- a/solenv/gbuild/extensions/pre_MergedLibsList.mk +++ b/solenv/gbuild/extensions/pre_MergedLibsList.mk @@ -46,6 +46,7 @@ gb_EXTRAMERGEDLIBS := \ graphicfilter \ guesslang \ $(if $(ENABLE_JAVA),hsqldb) \ + htmlfd \ hyphen \ icd \ icg \