pdfium: add support for reading the structure tree
+ add test for reading the tree Change-Id: I2f0e9d1852d20b3aa20ec0bcdd3ebc65370d15dd Reviewed-on: https://gerrit.libreoffice.org/c/core/+/180124 Tested-by: Jenkins Reviewed-by: Tomaž Vajngerl <quikee@gmail.com>
This commit is contained in:
committed by
Tomaž Vajngerl
parent
899d87a844
commit
c22cb6f2a5
@@ -183,6 +183,34 @@ public:
|
||||
virtual basegfx::B2DRectangle getCharBox(int nIndex, double fPageHeight) = 0;
|
||||
};
|
||||
|
||||
class VCL_DLLPUBLIC PDFiumStructureElement
|
||||
{
|
||||
public:
|
||||
virtual ~PDFiumStructureElement() = default;
|
||||
|
||||
virtual OUString getAltText() = 0;
|
||||
virtual OUString getActualText() = 0;
|
||||
virtual OUString getID() = 0;
|
||||
virtual OUString getLang() = 0;
|
||||
virtual OUString getTitle() = 0;
|
||||
virtual OUString getType() = 0;
|
||||
virtual OUString getObjectType() = 0;
|
||||
|
||||
virtual int getNumberOfChildren() = 0;
|
||||
virtual int getChildMarkedContentID(int nIndex) = 0;
|
||||
virtual std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) = 0;
|
||||
virtual std::unique_ptr<PDFiumStructureElement> getParent() = 0;
|
||||
};
|
||||
|
||||
class VCL_DLLPUBLIC PDFiumStructureTree
|
||||
{
|
||||
public:
|
||||
virtual ~PDFiumStructureTree() = default;
|
||||
|
||||
virtual int getNumberOfChildren() = 0;
|
||||
virtual std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) = 0;
|
||||
};
|
||||
|
||||
class VCL_DLLPUBLIC PDFiumPage
|
||||
{
|
||||
public:
|
||||
@@ -197,6 +225,7 @@ public:
|
||||
virtual std::unique_ptr<PDFiumAnnotation> getAnnotation(int nIndex) = 0;
|
||||
|
||||
virtual std::unique_ptr<PDFiumTextPage> getTextPage() = 0;
|
||||
virtual std::unique_ptr<PDFiumStructureTree> getStructureTree() = 0;
|
||||
|
||||
/// Get bitmap checksum of the page, without annotations/commenting.
|
||||
virtual BitmapChecksum getChecksum(int nMDPPerm) = 0;
|
||||
|
@@ -486,6 +486,151 @@ CPPUNIT_TEST_FIXTURE(PDFiumLibraryTest, testTools)
|
||||
CPPUNIT_ASSERT_EQUAL(false, bool(aDateTime.IsUTC));
|
||||
}
|
||||
|
||||
// Loads StructureTreeExampleDocument.pdf and walks its structure tree through
// the PDFium wrapper, checking element types, child counts, marked content
// IDs, the alt text of the rectangle and the parent look-up.
//
// Every pointer returned by getChild()/getParent() is asserted before it is
// dereferenced, so a missing element is reported as a test failure rather
// than crashing the test run.
CPPUNIT_TEST_FIXTURE(PDFiumLibraryTest, testStructureTree)
{
    OUString aURL = getFullUrl(u"StructureTreeExampleDocument.pdf");
    SvFileStream aStream(aURL, StreamMode::READ);
    GraphicFilter& rGraphicFilter = GraphicFilter::GetGraphicFilter();
    Graphic aGraphic = rGraphicFilter.ImportUnloadedGraphic(aStream);
    auto pVectorGraphicData = aGraphic.getVectorGraphicData();
    CPPUNIT_ASSERT(pVectorGraphicData);
    CPPUNIT_ASSERT_EQUAL(VectorGraphicDataType::Pdf, pVectorGraphicData->getType());
    auto& rDataContainer = pVectorGraphicData->getBinaryDataContainer();

    auto pPdfium = vcl::pdf::PDFiumLibrary::get();
    CPPUNIT_ASSERT(pPdfium);

    auto pDocument
        = pPdfium->openDocument(rDataContainer.getData(), rDataContainer.getSize(), OString());
    CPPUNIT_ASSERT(pDocument);

    CPPUNIT_ASSERT_EQUAL(1, pDocument->getPageCount());

    auto pPage = pDocument->openPage(0);
    CPPUNIT_ASSERT(pPage);

    auto pTree = pPage->getStructureTree();
    CPPUNIT_ASSERT(pTree);
    CPPUNIT_ASSERT_EQUAL(1, pTree->getNumberOfChildren());

    // Check the structure
    {
        auto pChildDocument = pTree->getChild(0);
        CPPUNIT_ASSERT(pChildDocument);
        CPPUNIT_ASSERT_EQUAL(5, pChildDocument->getNumberOfChildren());

        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getAltText());
        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getActualText());
        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getID());
        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getLang());
        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getTitle());
        CPPUNIT_ASSERT_EQUAL(u"Document"_ustr, pChildDocument->getType());
        CPPUNIT_ASSERT_EQUAL(u"StructElem"_ustr, pChildDocument->getObjectType());

        {
            auto pThis = pChildDocument->getChild(0);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(1, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(0, pThis->getChildMarkedContentID(0));
        }

        {
            auto pThis = pChildDocument->getChild(1);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"H1"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(2, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(1, pThis->getChildMarkedContentID(0));
            CPPUNIT_ASSERT_EQUAL(2, pThis->getChildMarkedContentID(1));
        }

        {
            auto pThis = pChildDocument->getChild(2);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(13, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(3, pThis->getChildMarkedContentID(0));
            {
                auto pChild = pThis->getChild(1);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Code"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(4, pChild->getChildMarkedContentID(0));

                // Check getParent
                auto pThis2 = pChild->getParent();
                CPPUNIT_ASSERT(pThis2);
                CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis2->getType());
                CPPUNIT_ASSERT_EQUAL(13, pThis2->getNumberOfChildren());
            }
            CPPUNIT_ASSERT_EQUAL(5, pThis->getChildMarkedContentID(2));
            CPPUNIT_ASSERT_EQUAL(6, pThis->getChildMarkedContentID(3));
            {
                auto pChild = pThis->getChild(4);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(7, pChild->getChildMarkedContentID(0));
            }
            CPPUNIT_ASSERT_EQUAL(8, pThis->getChildMarkedContentID(5));
            CPPUNIT_ASSERT_EQUAL(9, pThis->getChildMarkedContentID(6));
            {
                auto pChild = pThis->getChild(7);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(10, pChild->getChildMarkedContentID(0));
            }
            CPPUNIT_ASSERT_EQUAL(11, pThis->getChildMarkedContentID(8));
            CPPUNIT_ASSERT_EQUAL(12, pThis->getChildMarkedContentID(9));
            {
                auto pChild = pThis->getChild(10);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(13, pChild->getChildMarkedContentID(0));
            }
            CPPUNIT_ASSERT_EQUAL(14, pThis->getChildMarkedContentID(11));
            {
                auto pChild = pThis->getChild(12);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(15, pChild->getChildMarkedContentID(0));
            }
        }

        {
            auto pThis = pChildDocument->getChild(3);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(4, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(16, pThis->getChildMarkedContentID(0));
            {
                auto pChild = pThis->getChild(1);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Quote"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(17, pChild->getChildMarkedContentID(0));
            }
            CPPUNIT_ASSERT_EQUAL(18, pThis->getChildMarkedContentID(2));
            {
                auto pChild = pThis->getChild(3);
                CPPUNIT_ASSERT(pChild);
                // Rectangle
                CPPUNIT_ASSERT_EQUAL(u"Div"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(u"Only Text! - The Alt Text!"_ustr, pChild->getAltText());
                CPPUNIT_ASSERT_EQUAL(20, pChild->getChildMarkedContentID(0));
                {
                    // Text in rectangle
                    auto pRectangleElement = pChild->getChild(1);
                    CPPUNIT_ASSERT(pRectangleElement);
                    CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pRectangleElement->getType());
                    CPPUNIT_ASSERT_EQUAL(21, pRectangleElement->getChildMarkedContentID(0));
                }
            }
        }

        {
            auto pThis = pChildDocument->getChild(4);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(1, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(19, pThis->getChildMarkedContentID(0));
        }

        {
            // One past the last child: no element must be returned.
            auto pThis = pChildDocument->getChild(5);
            CPPUNIT_ASSERT(!pThis);
        }
    }
}
|
||||
|
||||
CPPUNIT_PLUGIN_IMPLEMENT();
|
||||
|
||||
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
||||
|
BIN
vcl/qa/cppunit/data/StructureTreeExampleDocument.odt
Normal file
BIN
vcl/qa/cppunit/data/StructureTreeExampleDocument.odt
Normal file
Binary file not shown.
BIN
vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf
Normal file
BIN
vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf
Normal file
Binary file not shown.
@@ -21,6 +21,7 @@
|
||||
#include <fpdf_signature.h>
|
||||
#include <fpdf_formfill.h>
|
||||
#include <fpdf_attachment.h>
|
||||
#include <fpdf_structtree.h>
|
||||
|
||||
#include <osl/endian.h>
|
||||
#include <vcl/bitmap.hxx>
|
||||
@@ -338,6 +339,47 @@ public:
|
||||
int getOptionCount(PDFiumDocument* pDoc) override;
|
||||
};
|
||||
|
||||
class PDFiumStructureElementImpl final : public PDFiumStructureElement
|
||||
{
|
||||
private:
|
||||
FPDF_STRUCTELEMENT mpStructureElement;
|
||||
|
||||
PDFiumStructureElementImpl(const PDFiumStructureElementImpl&) = delete;
|
||||
PDFiumStructureElementImpl& operator=(const PDFiumStructureElementImpl&) = delete;
|
||||
|
||||
public:
|
||||
PDFiumStructureElementImpl(FPDF_STRUCTELEMENT pStructureElement);
|
||||
|
||||
OUString getAltText() override;
|
||||
OUString getActualText() override;
|
||||
OUString getID() override;
|
||||
OUString getLang() override;
|
||||
OUString getTitle() override;
|
||||
OUString getType() override;
|
||||
OUString getObjectType() override;
|
||||
|
||||
int getNumberOfChildren() override;
|
||||
int getChildMarkedContentID(int nIndex) override;
|
||||
std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) override;
|
||||
std::unique_ptr<PDFiumStructureElement> getParent() override;
|
||||
};
|
||||
|
||||
class PDFiumStructureTreeImpl final : public PDFiumStructureTree
|
||||
{
|
||||
private:
|
||||
FPDF_STRUCTTREE mpStructureTree;
|
||||
|
||||
PDFiumStructureTreeImpl(const PDFiumStructureTreeImpl&) = delete;
|
||||
PDFiumStructureTreeImpl& operator=(const PDFiumStructureTreeImpl&) = delete;
|
||||
|
||||
public:
|
||||
PDFiumStructureTreeImpl(FPDF_STRUCTTREE pStructureTree);
|
||||
~PDFiumStructureTreeImpl();
|
||||
|
||||
int getNumberOfChildren() override;
|
||||
std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) override;
|
||||
};
|
||||
|
||||
class PDFiumPageObjectImpl final : public PDFiumPageObject
|
||||
{
|
||||
private:
|
||||
@@ -463,6 +505,8 @@ public:
|
||||
|
||||
std::unique_ptr<PDFiumTextPage> getTextPage() override;
|
||||
|
||||
std::unique_ptr<PDFiumStructureTree> getStructureTree() override;
|
||||
|
||||
BitmapChecksum getChecksum(int nMDPPerm) override;
|
||||
|
||||
double getWidth() override;
|
||||
@@ -974,6 +1018,17 @@ std::unique_ptr<PDFiumTextPage> PDFiumPageImpl::getTextPage()
|
||||
return pPDFiumTextPage;
|
||||
}
|
||||
|
||||
std::unique_ptr<PDFiumStructureTree> PDFiumPageImpl::getStructureTree()
|
||||
{
|
||||
std::unique_ptr<PDFiumStructureTree> pPDFiumStructureTree;
|
||||
FPDF_STRUCTTREE pStructTree = FPDF_StructTree_GetForPage(mpPage);
|
||||
if (pStructTree)
|
||||
{
|
||||
pPDFiumStructureTree = std::make_unique<PDFiumStructureTreeImpl>(pStructTree);
|
||||
}
|
||||
return pPDFiumStructureTree;
|
||||
}
|
||||
|
||||
bool PDFiumPageImpl::hasLinks()
|
||||
{
|
||||
// This could be a full iterator, but at the moment we just determine if the list is empty or
|
||||
@@ -1610,6 +1665,119 @@ std::unique_ptr<PDFiumPageObject> PDFiumAnnotationImpl::getObject(int nIndex)
|
||||
return pPDFiumPageObject;
|
||||
}
|
||||
|
||||
/// Stores the PDFium element handle; nothing else is done at construction.
PDFiumStructureElementImpl::PDFiumStructureElementImpl(FPDF_STRUCTELEMENT pStructureElement)
    : mpStructureElement(pStructureElement)
{
}
|
||||
|
||||
/// Alternative text of the element, fetched with FPDF_StructElement_GetAltText.
OUString PDFiumStructureElementImpl::getAltText()
{
    // getUnicodeString supplies the buffer and its length to the callback.
    const auto aFetchAltText = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetAltText(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aFetchAltText);
}
|
||||
|
||||
/// Actual text of the element, fetched with FPDF_StructElement_GetActualText.
OUString PDFiumStructureElementImpl::getActualText()
{
    // getUnicodeString supplies the buffer and its length to the callback.
    const auto aFetchActualText = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetActualText(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aFetchActualText);
}
|
||||
|
||||
/// ID of the element, fetched with FPDF_StructElement_GetID.
OUString PDFiumStructureElementImpl::getID()
{
    // getUnicodeString supplies the buffer and its length to the callback.
    const auto aFetchID = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetID(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aFetchID);
}
|
||||
|
||||
/// Language of the element, fetched with FPDF_StructElement_GetLang.
OUString PDFiumStructureElementImpl::getLang()
{
    // getUnicodeString supplies the buffer and its length to the callback.
    const auto aFetchLang = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetLang(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aFetchLang);
}
|
||||
|
||||
/// Structure type of the element, fetched with FPDF_StructElement_GetType.
OUString PDFiumStructureElementImpl::getType()
{
    // getUnicodeString supplies the buffer and its length to the callback.
    const auto aFetchType = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetType(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aFetchType);
}
|
||||
|
||||
/// Object type of the element, fetched with FPDF_StructElement_GetObjType.
OUString PDFiumStructureElementImpl::getObjectType()
{
    // getUnicodeString supplies the buffer and its length to the callback.
    const auto aFetchObjectType = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetObjType(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aFetchObjectType);
}
|
||||
|
||||
int PDFiumStructureElementImpl::getChildMarkedContentID(int nIndex)
|
||||
{
|
||||
return FPDF_StructElement_GetChildMarkedContentID(mpStructureElement, nIndex);
|
||||
}
|
||||
|
||||
/// Title of the element, fetched with FPDF_StructElement_GetTitle.
OUString PDFiumStructureElementImpl::getTitle()
{
    // getUnicodeString supplies the buffer and its length to the callback.
    const auto aFetchTitle = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetTitle(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aFetchTitle);
}
|
||||
|
||||
int PDFiumStructureElementImpl::getNumberOfChildren()
|
||||
{
|
||||
return FPDF_StructElement_CountChildren(mpStructureElement);
|
||||
}
|
||||
|
||||
std::unique_ptr<PDFiumStructureElement> PDFiumStructureElementImpl::getChild(int nIndex)
|
||||
{
|
||||
std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement;
|
||||
FPDF_STRUCTELEMENT pElement = FPDF_StructElement_GetChildAtIndex(mpStructureElement, nIndex);
|
||||
if (pElement)
|
||||
{
|
||||
pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement);
|
||||
}
|
||||
return pPDFiumStructureElement;
|
||||
}
|
||||
|
||||
std::unique_ptr<PDFiumStructureElement> PDFiumStructureElementImpl::getParent()
|
||||
{
|
||||
std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement;
|
||||
FPDF_STRUCTELEMENT pElement = FPDF_StructElement_GetParent(mpStructureElement);
|
||||
if (pElement)
|
||||
{
|
||||
pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement);
|
||||
}
|
||||
return pPDFiumStructureElement;
|
||||
}
|
||||
|
||||
/// Stores the PDFium tree handle; the destructor is responsible for closing it.
PDFiumStructureTreeImpl::PDFiumStructureTreeImpl(FPDF_STRUCTTREE pStructureTree)
    : mpStructureTree(pStructureTree)
{
}
|
||||
|
||||
/// Closes the wrapped structure tree handle; does nothing for a null handle.
PDFiumStructureTreeImpl::~PDFiumStructureTreeImpl()
{
    if (!mpStructureTree)
        return;

    FPDF_StructTree_Close(mpStructureTree);
}
|
||||
|
||||
int PDFiumStructureTreeImpl::getNumberOfChildren()
|
||||
{
|
||||
return FPDF_StructTree_CountChildren(mpStructureTree);
|
||||
}
|
||||
|
||||
std::unique_ptr<PDFiumStructureElement> PDFiumStructureTreeImpl::getChild(int nIndex)
|
||||
{
|
||||
std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement;
|
||||
FPDF_STRUCTELEMENT pElement = FPDF_StructTree_GetChildAtIndex(mpStructureTree, nIndex);
|
||||
if (pElement)
|
||||
{
|
||||
pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement);
|
||||
}
|
||||
return pPDFiumStructureElement;
|
||||
}
|
||||
|
||||
PDFiumTextPageImpl::PDFiumTextPageImpl(FPDF_TEXTPAGE pTextPage)
|
||||
: mpTextPage(pTextPage)
|
||||
{
|
||||
|
Reference in New Issue
Block a user