pdfium: add support for reading the structure tree

+ add test for reading the tree

Change-Id: I2f0e9d1852d20b3aa20ec0bcdd3ebc65370d15dd
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/180124
Tested-by: Jenkins
Reviewed-by: Tomaž Vajngerl <quikee@gmail.com>
This commit is contained in:
Tomaž Vajngerl
2025-01-10 23:26:03 +09:00
committed by Tomaž Vajngerl
parent 899d87a844
commit c22cb6f2a5
5 changed files with 342 additions and 0 deletions

View File

@@ -183,6 +183,34 @@ public:
virtual basegfx::B2DRectangle getCharBox(int nIndex, double fPageHeight) = 0;
};
/// Abstract interface to a single element of a PDF structure tree.
///
/// Elements are obtained from PDFiumStructureTree::getChild() and from the
/// getChild() / getParent() methods of another element.
/// NOTE(review): the implementation added alongside this interface only stores a raw
/// PDFium handle and never releases it, so an element presumably must not be used
/// after the tree it was obtained from has been destroyed - confirm against the
/// PDFium documentation.
class VCL_DLLPUBLIC PDFiumStructureElement
{
public:
virtual ~PDFiumStructureElement() = default;
/// Alt text of the element.
virtual OUString getAltText() = 0;
/// Actual text of the element.
virtual OUString getActualText() = 0;
/// ID of the element.
virtual OUString getID() = 0;
/// Language of the element.
virtual OUString getLang() = 0;
/// Title of the element.
virtual OUString getTitle() = 0;
/// Type of the element; the unit test expects values such as "Document", "P", "H1" or "Span".
virtual OUString getType() = 0;
/// Object type of the element; the unit test expects "StructElem" for the document element.
virtual OUString getObjectType() = 0;
/// Number of children of this element.
virtual int getNumberOfChildren() = 0;
/// Marked content ID of the child at nIndex.
/// NOTE(review): the result for a child that has no marked content ID is decided by
/// PDFium (presumably -1) - confirm.
virtual int getChildMarkedContentID(int nIndex) = 0;
/// Child element at nIndex; the implementation returns a null pointer when PDFium
/// provides no element for that index.
virtual std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) = 0;
/// Parent element; the implementation returns a null pointer when PDFium provides none.
virtual std::unique_ptr<PDFiumStructureElement> getParent() = 0;
};
/// Abstract interface to the structure tree of a PDF page, as returned by
/// PDFiumPage::getStructureTree().
class VCL_DLLPUBLIC PDFiumStructureTree
{
public:
virtual ~PDFiumStructureTree() = default;
/// Number of top-level elements in the tree.
virtual int getNumberOfChildren() = 0;
/// Top-level element at nIndex; the implementation returns a null pointer when PDFium
/// provides no element for that index.
virtual std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) = 0;
};
class VCL_DLLPUBLIC PDFiumPage
{
public:
@@ -197,6 +225,7 @@ public:
virtual std::unique_ptr<PDFiumAnnotation> getAnnotation(int nIndex) = 0;
virtual std::unique_ptr<PDFiumTextPage> getTextPage() = 0;
/// Get the structure tree of the page; the implementation returns a null pointer when
/// PDFium provides no tree for the page.
virtual std::unique_ptr<PDFiumStructureTree> getStructureTree() = 0;
/// Get bitmap checksum of the page, without annotations/commenting.
virtual BitmapChecksum getChecksum(int nMDPPerm) = 0;

View File

@@ -486,6 +486,151 @@ CPPUNIT_TEST_FIXTURE(PDFiumLibraryTest, testTools)
CPPUNIT_ASSERT_EQUAL(false, bool(aDateTime.IsUTC));
}
/// Imports StructureTreeExampleDocument.pdf through the graphic filter, opens its data
/// with PDFium and walks the structure tree of the single page, checking element
/// types, child counts, marked content IDs, the alt text of one element and getParent().
///
/// Every element pointer is asserted to be non-null before it is dereferenced, so a
/// missing element is reported as a test failure instead of crashing the test process.
CPPUNIT_TEST_FIXTURE(PDFiumLibraryTest, testStructureTree)
{
    OUString aURL = getFullUrl(u"StructureTreeExampleDocument.pdf");
    SvFileStream aStream(aURL, StreamMode::READ);
    GraphicFilter& rGraphicFilter = GraphicFilter::GetGraphicFilter();
    Graphic aGraphic = rGraphicFilter.ImportUnloadedGraphic(aStream);
    auto pVectorGraphicData = aGraphic.getVectorGraphicData();
    CPPUNIT_ASSERT(pVectorGraphicData);
    CPPUNIT_ASSERT_EQUAL(VectorGraphicDataType::Pdf, pVectorGraphicData->getType());
    auto& rDataContainer = pVectorGraphicData->getBinaryDataContainer();

    auto pPdfium = vcl::pdf::PDFiumLibrary::get();
    CPPUNIT_ASSERT(pPdfium);

    auto pDocument
        = pPdfium->openDocument(rDataContainer.getData(), rDataContainer.getSize(), OString());
    CPPUNIT_ASSERT(pDocument);

    CPPUNIT_ASSERT_EQUAL(1, pDocument->getPageCount());

    auto pPage = pDocument->openPage(0);
    CPPUNIT_ASSERT(pPage);

    // pTree is declared after pPage and before any element, so it is destroyed after
    // all elements obtained from it and before the page.
    auto pTree = pPage->getStructureTree();
    CPPUNIT_ASSERT(pTree);
    CPPUNIT_ASSERT_EQUAL(1, pTree->getNumberOfChildren());

    // Check the structure
    {
        auto pChildDocument = pTree->getChild(0);
        CPPUNIT_ASSERT(pChildDocument);
        CPPUNIT_ASSERT_EQUAL(5, pChildDocument->getNumberOfChildren());

        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getAltText());
        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getActualText());
        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getID());
        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getLang());
        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getTitle());
        CPPUNIT_ASSERT_EQUAL(u"Document"_ustr, pChildDocument->getType());
        CPPUNIT_ASSERT_EQUAL(u"StructElem"_ustr, pChildDocument->getObjectType());

        {
            auto pThis = pChildDocument->getChild(0);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(1, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(0, pThis->getChildMarkedContentID(0));
        }

        {
            auto pThis = pChildDocument->getChild(1);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"H1"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(2, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(1, pThis->getChildMarkedContentID(0));
            CPPUNIT_ASSERT_EQUAL(2, pThis->getChildMarkedContentID(1));
        }

        {
            auto pThis = pChildDocument->getChild(2);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(13, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(3, pThis->getChildMarkedContentID(0));
            {
                auto pChild = pThis->getChild(1);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Code"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(4, pChild->getChildMarkedContentID(0));

                // Check getParent
                auto pThis2 = pChild->getParent();
                CPPUNIT_ASSERT(pThis2);
                CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis2->getType());
                CPPUNIT_ASSERT_EQUAL(13, pThis2->getNumberOfChildren());
            }
            CPPUNIT_ASSERT_EQUAL(5, pThis->getChildMarkedContentID(2));
            CPPUNIT_ASSERT_EQUAL(6, pThis->getChildMarkedContentID(3));
            {
                auto pChild = pThis->getChild(4);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(7, pChild->getChildMarkedContentID(0));
            }
            CPPUNIT_ASSERT_EQUAL(8, pThis->getChildMarkedContentID(5));
            CPPUNIT_ASSERT_EQUAL(9, pThis->getChildMarkedContentID(6));
            {
                auto pChild = pThis->getChild(7);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(10, pChild->getChildMarkedContentID(0));
            }
            CPPUNIT_ASSERT_EQUAL(11, pThis->getChildMarkedContentID(8));
            CPPUNIT_ASSERT_EQUAL(12, pThis->getChildMarkedContentID(9));
            {
                auto pChild = pThis->getChild(10);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(13, pChild->getChildMarkedContentID(0));
            }
            CPPUNIT_ASSERT_EQUAL(14, pThis->getChildMarkedContentID(11));
            {
                auto pChild = pThis->getChild(12);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(15, pChild->getChildMarkedContentID(0));
            }
        }

        {
            auto pThis = pChildDocument->getChild(3);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(4, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(16, pThis->getChildMarkedContentID(0));
            {
                auto pChild = pThis->getChild(1);
                CPPUNIT_ASSERT(pChild);
                CPPUNIT_ASSERT_EQUAL(u"Quote"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(17, pChild->getChildMarkedContentID(0));
            }
            CPPUNIT_ASSERT_EQUAL(18, pThis->getChildMarkedContentID(2));
            {
                auto pChild = pThis->getChild(3);
                CPPUNIT_ASSERT(pChild);
                // Rectangle
                CPPUNIT_ASSERT_EQUAL(u"Div"_ustr, pChild->getType());
                CPPUNIT_ASSERT_EQUAL(u"Only Text! - The Alt Text!"_ustr, pChild->getAltText());
                CPPUNIT_ASSERT_EQUAL(20, pChild->getChildMarkedContentID(0));
                {
                    // Text in rectangle
                    auto pRectangleElement = pChild->getChild(1);
                    CPPUNIT_ASSERT(pRectangleElement);
                    CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pRectangleElement->getType());
                    CPPUNIT_ASSERT_EQUAL(21, pRectangleElement->getChildMarkedContentID(0));
                }
            }
        }

        {
            auto pThis = pChildDocument->getChild(4);
            CPPUNIT_ASSERT(pThis);
            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
            CPPUNIT_ASSERT_EQUAL(1, pThis->getNumberOfChildren());
            CPPUNIT_ASSERT_EQUAL(19, pThis->getChildMarkedContentID(0));
        }

        {
            // One past the last child: no element is expected.
            auto pThis = pChildDocument->getChild(5);
            CPPUNIT_ASSERT(!pThis);
        }
    }
}
CPPUNIT_PLUGIN_IMPLEMENT();
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Binary file not shown.

Binary file not shown.

View File

@@ -21,6 +21,7 @@
#include <fpdf_signature.h>
#include <fpdf_formfill.h>
#include <fpdf_attachment.h>
#include <fpdf_structtree.h>
#include <osl/endian.h>
#include <vcl/bitmap.hxx>
@@ -338,6 +339,47 @@ public:
int getOptionCount(PDFiumDocument* pDoc) override;
};
/// PDFium-backed implementation of PDFiumStructureElement.
///
/// Stores a FPDF_STRUCTELEMENT handle and forwards every query to the matching
/// FPDF_StructElement_* function. The class declares no destructor, so the handle is
/// never released here.
/// NOTE(review): the handle is presumably owned by the structure tree it was obtained
/// from and only valid while that tree is open - confirm against the PDFium docs.
class PDFiumStructureElementImpl final : public PDFiumStructureElement
{
private:
    FPDF_STRUCTELEMENT mpStructureElement;

    PDFiumStructureElementImpl(const PDFiumStructureElementImpl&) = delete;
    PDFiumStructureElementImpl& operator=(const PDFiumStructureElementImpl&) = delete;

public:
    /// Wraps pStructureElement; explicit so that a raw handle is never converted into
    /// a wrapper implicitly.
    explicit PDFiumStructureElementImpl(FPDF_STRUCTELEMENT pStructureElement);

    OUString getAltText() override;
    OUString getActualText() override;
    OUString getID() override;
    OUString getLang() override;
    OUString getTitle() override;
    OUString getType() override;
    OUString getObjectType() override;

    int getNumberOfChildren() override;
    int getChildMarkedContentID(int nIndex) override;
    /// Returns a null pointer when PDFium yields no child element for nIndex.
    std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) override;
    /// Returns a null pointer when PDFium yields no parent element.
    std::unique_ptr<PDFiumStructureElement> getParent() override;
};
class PDFiumStructureTreeImpl final : public PDFiumStructureTree
{
private:
FPDF_STRUCTTREE mpStructureTree;
PDFiumStructureTreeImpl(const PDFiumStructureTreeImpl&) = delete;
PDFiumStructureTreeImpl& operator=(const PDFiumStructureTreeImpl&) = delete;
public:
PDFiumStructureTreeImpl(FPDF_STRUCTTREE pStructureTree);
~PDFiumStructureTreeImpl();
int getNumberOfChildren() override;
std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) override;
};
class PDFiumPageObjectImpl final : public PDFiumPageObject
{
private:
@@ -463,6 +505,8 @@ public:
std::unique_ptr<PDFiumTextPage> getTextPage() override;
/// Wraps the result of FPDF_StructTree_GetForPage() for this page; null if PDFium returns no tree.
std::unique_ptr<PDFiumStructureTree> getStructureTree() override;
BitmapChecksum getChecksum(int nMDPPerm) override;
double getWidth() override;
@@ -974,6 +1018,17 @@ std::unique_ptr<PDFiumTextPage> PDFiumPageImpl::getTextPage()
return pPDFiumTextPage;
}
std::unique_ptr<PDFiumStructureTree> PDFiumPageImpl::getStructureTree()
{
std::unique_ptr<PDFiumStructureTree> pPDFiumStructureTree;
FPDF_STRUCTTREE pStructTree = FPDF_StructTree_GetForPage(mpPage);
if (pStructTree)
{
pPDFiumStructureTree = std::make_unique<PDFiumStructureTreeImpl>(pStructTree);
}
return pPDFiumStructureTree;
}
bool PDFiumPageImpl::hasLinks()
{
// This could be a full iterator, but at the moment we just determine if the list is empty or
@@ -1610,6 +1665,119 @@ std::unique_ptr<PDFiumPageObject> PDFiumAnnotationImpl::getObject(int nIndex)
return pPDFiumPageObject;
}
PDFiumStructureElementImpl::PDFiumStructureElementImpl(FPDF_STRUCTELEMENT pStructureElement)
: mpStructureElement(pStructureElement)
{
}
/// Returns the alt text of the element.
///
/// The string is produced by getUnicodeString(), which is handed a callback that
/// forwards its buffer and length to FPDF_StructElement_GetAltText().
OUString PDFiumStructureElementImpl::getAltText()
{
    auto aReadAltText = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetAltText(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aReadAltText);
}
/// Returns the actual text of the element.
///
/// The string is produced by getUnicodeString(), which is handed a callback that
/// forwards its buffer and length to FPDF_StructElement_GetActualText().
OUString PDFiumStructureElementImpl::getActualText()
{
    auto aReadActualText = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetActualText(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aReadActualText);
}
/// Returns the ID of the element.
///
/// The string is produced by getUnicodeString(), which is handed a callback that
/// forwards its buffer and length to FPDF_StructElement_GetID().
OUString PDFiumStructureElementImpl::getID()
{
    auto aReadID = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetID(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aReadID);
}
/// Returns the language of the element.
///
/// The string is produced by getUnicodeString(), which is handed a callback that
/// forwards its buffer and length to FPDF_StructElement_GetLang().
OUString PDFiumStructureElementImpl::getLang()
{
    auto aReadLang = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetLang(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aReadLang);
}
/// Returns the type of the element (the unit test expects e.g. "Document", "P", "Span").
///
/// The string is produced by getUnicodeString(), which is handed a callback that
/// forwards its buffer and length to FPDF_StructElement_GetType().
OUString PDFiumStructureElementImpl::getType()
{
    auto aReadType = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetType(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aReadType);
}
/// Returns the object type of the element (the unit test expects "StructElem" for
/// the document element).
///
/// The string is produced by getUnicodeString(), which is handed a callback that
/// forwards its buffer and length to FPDF_StructElement_GetObjType().
OUString PDFiumStructureElementImpl::getObjectType()
{
    auto aReadObjectType = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetObjType(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aReadObjectType);
}
/// Returns whatever FPDF_StructElement_GetChildMarkedContentID() reports for the child
/// at nIndex; the value is passed through unchanged.
int PDFiumStructureElementImpl::getChildMarkedContentID(int nIndex)
{
    const int nMarkedContentID
        = FPDF_StructElement_GetChildMarkedContentID(mpStructureElement, nIndex);
    return nMarkedContentID;
}
/// Returns the title of the element.
///
/// The string is produced by getUnicodeString(), which is handed a callback that
/// forwards its buffer and length to FPDF_StructElement_GetTitle().
OUString PDFiumStructureElementImpl::getTitle()
{
    auto aReadTitle = [this](FPDF_WCHAR* pBuffer, unsigned long nLength) {
        return FPDF_StructElement_GetTitle(mpStructureElement, pBuffer, nLength);
    };
    return getUnicodeString(aReadTitle);
}
/// Returns the child count reported by FPDF_StructElement_CountChildren(), unchanged.
int PDFiumStructureElementImpl::getNumberOfChildren()
{
    const int nChildCount = FPDF_StructElement_CountChildren(mpStructureElement);
    return nChildCount;
}
std::unique_ptr<PDFiumStructureElement> PDFiumStructureElementImpl::getChild(int nIndex)
{
std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement;
FPDF_STRUCTELEMENT pElement = FPDF_StructElement_GetChildAtIndex(mpStructureElement, nIndex);
if (pElement)
{
pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement);
}
return pPDFiumStructureElement;
}
std::unique_ptr<PDFiumStructureElement> PDFiumStructureElementImpl::getParent()
{
std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement;
FPDF_STRUCTELEMENT pElement = FPDF_StructElement_GetParent(mpStructureElement);
if (pElement)
{
pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement);
}
return pPDFiumStructureElement;
}
PDFiumStructureTreeImpl::PDFiumStructureTreeImpl(FPDF_STRUCTTREE pStructureTree)
: mpStructureTree(pStructureTree)
{
}
/// Closes the wrapped structure tree handle, if there is one.
PDFiumStructureTreeImpl::~PDFiumStructureTreeImpl()
{
    if (!mpStructureTree)
        return;

    FPDF_StructTree_Close(mpStructureTree);
}
/// Returns the child count reported by FPDF_StructTree_CountChildren(), unchanged.
int PDFiumStructureTreeImpl::getNumberOfChildren()
{
    const int nChildCount = FPDF_StructTree_CountChildren(mpStructureTree);
    return nChildCount;
}
std::unique_ptr<PDFiumStructureElement> PDFiumStructureTreeImpl::getChild(int nIndex)
{
std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement;
FPDF_STRUCTELEMENT pElement = FPDF_StructTree_GetChildAtIndex(mpStructureTree, nIndex);
if (pElement)
{
pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement);
}
return pPDFiumStructureElement;
}
PDFiumTextPageImpl::PDFiumTextPageImpl(FPDF_TEXTPAGE pTextPage)
: mpTextPage(pTextPage)
{