PowerToys/src/modules/MeasureTool/MeasureToolCore/BGRATextureView.h

#pragma once

#include <cinttypes>
#include <wil/resource.h>
#ifdef _M_ARM64
#include <arm64_neon.h.>
#else
#include <emmintrin.h>
#endif
#include <cassert>
#include <limits>

//#define DEBUG_TEXTURE

#if defined(_M_ARM64)

// Adopted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h

using __m128i = int64x2_t;

// NEON stand-in for the SSE2 intrinsic of the same name: produces a vector whose
// lowest 32-bit lane is `a` and whose remaining three 32-bit lanes are zero.
inline __m128i _mm_cvtsi32_si128(int a)
{
    const int32x4_t allZero = vdupq_n_s32(0);
    const int32x4_t lowLaneSet = vsetq_lane_s32(a, allZero, 0);
    return vreinterpretq_s64_s32(lowLaneSet);
}

// NEON stand-in for the SSE2 intrinsic of the same name: bitwise OR of the two
// 128-bit operands (carried out on their 32-bit lane view).
inline __m128i _mm_or_si128(__m128i a, __m128i b)
{
    const int32x4_t lhs = vreinterpretq_s32_s64(a);
    const int32x4_t rhs = vreinterpretq_s32_s64(b);
    const int32x4_t combined = vorrq_s32(lhs, rhs);
    return vreinterpretq_s64_s32(combined);
}

// NEON stand-in for the SSE2 intrinsic of the same name: subtracts the unsigned
// bytes of `b` from those of `a` using the saturating subtract vqsubq_u8.
inline __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    const uint8x16_t minuend = vreinterpretq_u8_s64(a);
    const uint8x16_t subtrahend = vreinterpretq_u8_s64(b);
    const uint8x16_t saturatedDiff = vqsubq_u8(minuend, subtrahend);
    return vreinterpretq_s64_u8(saturatedDiff);
}

// NEON stand-in for the SSE2 intrinsic of the same name: takes the absolute
// difference of every unsigned byte pair of `a` and `b`, then adds the results up
// so that each 64-bit lane of the return value holds the sum for its own 8 bytes.
inline __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    // Reinterpret through the NEON intrinsics, as the other shims in this file do,
    // instead of relying on C-style casts between vector types.
    const uint8x16_t lhs = vreinterpretq_u8_s64(a);
    const uint8x16_t rhs = vreinterpretq_u8_s64(b);
    const uint8x16_t absDiff = vabdq_u8(lhs, rhs);

    // Three pairwise widening additions: 16 x u8 -> 8 x u16 -> 4 x u32 -> 2 x u64.
    const uint16x8_t sums16 = vpaddlq_u8(absDiff);
    const uint32x4_t sums32 = vpaddlq_u16(sums16);
    const uint64x2_t sums64 = vpaddlq_u32(sums32);
    return vreinterpretq_s64_u64(sums64);
}

// NEON stand-in for the SSE2 intrinsic of the same name: returns a 128-bit
// vector with every bit cleared.
inline __m128i _mm_setzero_si128()
{
    const int32x4_t zeros = vdupq_n_s32(0);
    return vreinterpretq_s64_s32(zeros);
}

// NEON stand-in for the SSE2 intrinsic of the same name: returns the lowest
// 32-bit lane of `a` as a signed integer.
inline int _mm_cvtsi128_si32(__m128i a)
{
    const int32x4_t lanes32 = vreinterpretq_s32_s64(a);
    return vgetq_lane_s32(lanes32, 0);
}

// NEON stand-in for the SSE2 intrinsic of the same name: broadcasts `w` into
// all eight 16-bit lanes of the result.
inline __m128i _mm_set1_epi16(short w)
{
    const int16x8_t broadcast = vdupq_n_s16(w);
    return vreinterpretq_s64_s16(broadcast);
}

// NEON stand-in for the SSE2 intrinsic of the same name: compares the signed
// 16-bit lanes of `a` and `b` with vcgtq_s16 (a > b) and returns the per-lane
// comparison mask.
inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    const int16x8_t lhs = vreinterpretq_s16_s64(a);
    const int16x8_t rhs = vreinterpretq_s16_s64(b);
    const uint16x8_t lhsGreater = vcgtq_s16(lhs, rhs);
    return vreinterpretq_s64_u16(lhsGreater);
}

// NEON stand-in for the SSE4.1 intrinsic of the same name: zero-extends the
// eight lowest unsigned bytes of `a` into eight 16-bit lanes. The upper eight
// bytes of `a` are ignored.
inline __m128i _mm_cvtepu8_epi16(__m128i a)
{
    const uint8x16_t allBytes = vreinterpretq_u8_s64(a); /* xxxx xxxx HGFE DCBA */
    const uint8x8_t lowBytes = vget_low_u8(allBytes); /* HGFE DCBA */
    const uint16x8_t widened = vmovl_u8(lowBytes); /* 0H0G 0F0E 0D0C 0B0A */
    return vreinterpretq_s64_u16(widened);
}

// NEON stand-in for the SSE2 intrinsic of the same name: returns the lower
// 64-bit lane of `a`.
inline int64_t _mm_cvtsi128_si64(__m128i a)
{
    const int64_t lowLane = vgetq_lane_s64(a, 0);
    return lowLane;
}
#endif

// Absolute difference |a - b| of every unsigned byte of the two vectors.
// A saturating unsigned subtraction yields zero wherever the result would be
// negative, so for each byte at most one of (a - b) and (b - a) is non-zero and
// OR-ing the two gives the absolute difference.
inline __m128i distance_epu8(const __m128i a, __m128i b)
{
    const __m128i aMinusB = _mm_subs_epu8(a, b);
    const __m128i bMinusA = _mm_subs_epu8(b, a);
    return _mm_or_si128(aMinusB, bMinusA);
}

// Read-only view over a texture stored as one 32-bit value per pixel.
// The struct only stores a pointer and dimensions; it has no destructor and
// therefore never releases the pixel memory it points to.
struct BGRATextureView
{
    // First pixel of the viewed texture (not released by this struct).
    const uint32_t* pixels = nullptr;
    // Distance between the starts of two consecutive rows, counted in pixels
    // (GetPixel indexes `pixels` with x + pitch * y).
    size_t pitch = {};
    // Number of addressable pixels per row.
    size_t width = {};
    // Number of addressable rows.
    size_t height = {};

    BGRATextureView() = default;

    // Declaring the move constructor suppresses the implicit copy operations,
    // so a view can be moved but not copied.
    BGRATextureView(BGRATextureView&& rhs) = default;

    // Returns the raw 32-bit value of the pixel at column `x`, row `y`.
    // Both coordinates must lie inside the view; this is only checked by
    // assertions, i.e. not at all when NDEBUG is defined.
    inline uint32_t GetPixel(const size_t x, const size_t y) const
    {
        // The coordinates are unsigned, so only the upper bounds can be violated.
        assert(x < width);
        assert(y < height);
        return pixels[x + pitch * y];
    }

    // Decides whether two 32-bit pixels are similar enough to be treated as equal.
    //
    // All four bytes of each pixel take part in the comparison.
    //   perChannel == true:  every single byte distance must be <= tolerance.
    //   perChannel == false: the sum of the four byte distances must be <= tolerance.
    //
    // Returns true when the pixels are within the tolerance.
    template<bool perChannel>
    static inline bool PixelsClose(const uint32_t pixel1, const uint32_t pixel2, uint8_t tolerance)
    {
        // Each pixel occupies the low 32 bits of its vector; all other bytes are
        // zero, so they contribute a distance of zero below.
        const __m128i rgba1 = _mm_cvtsi32_si128(static_cast<int>(pixel1));
        const __m128i rgba2 = _mm_cvtsi32_si128(static_cast<int>(pixel2));
        const __m128i distances = distance_epu8(rgba1, rgba2);

        // Method 1: Test whether each channel distance is not greater than tolerance
        if constexpr (perChannel)
        {
            // Widen the byte distances to 16 bits so the signed 16-bit compare
            // treats values above 127 correctly.
            const __m128i tolerances = _mm_set1_epi16(tolerance);
            const auto gtResults128 = _mm_cmpgt_epi16(_mm_cvtepu8_epi16(distances), tolerances);
            // The low 64 bits hold the four compare results of the pixel's channels;
            // any set bit means at least one channel exceeded the tolerance.
            return _mm_cvtsi128_si64(gtResults128) == 0;
        }
        else
        {
            // Method 2: Test whether the sum of all channel distances is not greater than tolerance.
            // The sum can be as large as 4 * 255 = 1020, so it has to be compared at
            // full width: truncating it to 8 bits would wrap large differences
            // around to small scores and report clearly different pixels as close.
            const int32_t score = _mm_cvtsi128_si32(_mm_sad_epu8(distances, _mm_setzero_si128()));
            return score <= tolerance;
        }
    }

#if defined(DEBUG_TEXTURE)
    // Debug-only helper, declared here and defined outside this header.
    // NOTE(review): presumably writes the viewed pixels to `filename` as a bitmap - confirm against the definition.
    void SaveAsBitmap(const char* filename) const;
#endif
};
[New PowerToy] Add Screen Ruler module for measuring screen contents (#19701) * [MeasureTool] initial commit * [chore] clean up needless WindowsTargetPlatformVersion overrides from projects * [MeasureTool] initial implementation * Fix build errors * Update vsconfig for needed Windows 10 SDK versions * fix spellchecker * another spellcheck fix * more spellcheck errors * Fix measurement being off by 1 on both ends * UI fixes * Add feet to crosses * Remove anti-aliasing, as it's creating artifacts * Use pixel tolerance from settings * Tooltip updates * Restore antialiasing to draw the tooltip * remove comment for spell check * Updated icons * Icon updates * Improve measurement accuracy and display * Fix spellchecker * Add less precise drawing on continuous warning * Add setting for turning cross feet on * Swap LMB/RMB for interaction * Uncheck active tool's RadioButton when it exits * activation hotkey toggles UI instead of just launching it * track runner process and exit when it exits * add proj ref * toolbar is interactive during measurements * always open toolbar on the main display * refactor colors * refactor edge detection & overlay ui * refactor overlay ui even more * simplify state structs * multimonitor preparation: eliminate global state * prepare for merge * spelling * proper thread termination + minor fixes * multimonitor: launch tools on all monitors * multimonitor support: track cursor position * spell * fix powertoys! 
* ScreenSize -> Box * add shadow effect for textbox * spell * fix debug mode * dynamic text box size based on text layout metrics * add mouse wheel to adjust pixel tolerance + per channel detection algorithm setting * spelling * fix per channel distance calculations * update installer deps + spelling * tool activation telemetry * update assets and try to fix build * use × instead of x * allow multiple measurements with bounds tool with shift-click * move #define DEBUG_OVERLAY in an appropriate space * spell-checked * update issue template + refactor text box drawing * implement custom renderer and make × semiopaque * spelling * pass dpiScale to x renderer * add sse2neon license * update OOBE * move license to NOTICE * appropriate module preview image * localization for AutomationPeer * increase default pixel tolerance from 5 to 30 * add PowerToys.MeasureToolUI.exe to bugreport * explicitly set texture dims * clarify continuous capture description * fix a real spelling error! * cleanup * clean up x2 * debug texture * fix texture access * fix saveasbitmap * improve sum of all channel diffs method score calc * optimize * ContinuousCapture is enabled by default to avoid confusion * build fix * draw captured screen in a non continuous mode * cast a spell... 
* merge fix * disable stroboscopic effect * split global/perScreen measure state and minor improvements * spelling * fix comment * primary monitor debug also active for the bounds tool * dpi from rt for custom renderer * add comment * fix off by 1 * make backround convertion success for non continuous mode non-essential * fix spelling * overlay window covers taskbar * fix CI * revert taskbar covering * fix CI * fix ci again * fix 2 * fix ci * CI fix * fix arm ci * cleanup cursor convertion between coordinate spaces * fix spelling * Fix signing * Fix MeasureToolUI version * Fix core version * fix race condition in system internals which happens during concurrent d3d/d2d resource creation Co-authored-by: Jaime Bernardo <jaime@janeasystems.com> Co-authored-by: Niels Laute <niels.laute@live.nl> 2022-08-27 02:17:20 +03:00			`#pragma once`

			`#include <cinttypes>`
			`#include <wil/resource.h>`
			`#ifdef _M_ARM64`
			`#include <arm64_neon.h.>`
			`#else`
			`#include <emmintrin.h>`
			`#endif`
			`#include <cassert>`
			`#include <limits>`

			`//#define DEBUG_TEXTURE`

			`#if defined(_M_ARM64)`

			`// Adopted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h`

			`using __m128i = int64x2_t;`

			`inline __m128i _mm_cvtsi32_si128(int a)`
			`{`
			`return vreinterpretq_s64_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));`
			`}`

			`inline __m128i _mm_or_si128(__m128i a, __m128i b)`
			`{`
			`return vreinterpretq_s64_s32(`
			`vorrq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));`
			`}`

			`inline __m128i _mm_subs_epu8(__m128i a, __m128i b)`
			`{`
			`return vreinterpretq_s64_u8(`
			`vqsubq_u8(vreinterpretq_u8_s64(a), vreinterpretq_u8_s64(b)));`
			`}`

			`inline __m128i _mm_sad_epu8(__m128i a, __m128i b)`
			`{`
			`uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t)a, (uint8x16_t)b));`
			`return vreinterpretq_s64_u64(vpaddlq_u32(vpaddlq_u16(t)));`
			`}`

			`inline __m128i _mm_setzero_si128(void)`
			`{`
			`return vreinterpretq_s64_s32(vdupq_n_s32(0));`
			`}`

			`inline int _mm_cvtsi128_si32(__m128i a)`
			`{`
			`return vgetq_lane_s32(vreinterpretq_s32_s64(a), 0);`
			`}`

			`inline __m128i _mm_set1_epi16(short w)`
			`{`
			`return vreinterpretq_s64_s16(vdupq_n_s16(w));`
			`}`

			`inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)`
			`{`
			`return vreinterpretq_s64_u16(`
			`vcgtq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));`
			`}`

			`inline __m128i _mm_cvtepu8_epi16(__m128i a)`
			`{`
			`uint8x16_t u8x16 = vreinterpretq_u8_s64(a); /* xxxx xxxx HGFE DCBA */`
			`uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */`
			`return vreinterpretq_s64_u16(u16x8);`
			`}`

			`inline int64_t _mm_cvtsi128_si64(__m128i a)`
			`{`
			`return vgetq_lane_s64(a, 0);`
			`}`
			`#endif`

			`inline __m128i distance_epu8(const __m128i a, __m128i b)`
			`{`
			`return _mm_or_si128(_mm_subs_epu8(a, b),`
			`_mm_subs_epu8(b, a));`
			`}`

			`struct BGRATextureView`
			`{`
			`const uint32_t* pixels = nullptr;`
			`size_t pitch = {};`
			`size_t width = {};`
			`size_t height = {};`

			`BGRATextureView() = default;`

			`BGRATextureView(BGRATextureView&& rhs) = default;`

			`inline uint32_t GetPixel(const size_t x, const size_t y) const`
			`{`
			`assert(x < width && x >= 0);`
			`assert(y < height && y >= 0);`
			`return pixels[x + pitch * y];`
			`}`

			`template<bool perChannel>`
			`static inline bool PixelsClose(const uint32_t pixel1, const uint32_t pixel2, uint8_t tolerance)`
			`{`
			`const __m128i rgba1 = _mm_cvtsi32_si128(pixel1);`
			`const __m128i rgba2 = _mm_cvtsi32_si128(pixel2);`
			`const __m128i distances = distance_epu8(rgba1, rgba2);`

			`// Method 1: Test whether each channel distance is not greater than tolerance`
			`if constexpr (perChannel)`
			`{`
			`const __m128i tolerances = _mm_set1_epi16(tolerance);`
			`const auto gtResults128 = _mm_cmpgt_epi16(_mm_cvtepu8_epi16(distances), tolerances);`
			`return _mm_cvtsi128_si64(gtResults128) == 0;`
			`}`
			`else`
			`{`
			`// Method 2: Test whether sum of all channel differences is smaller than tolerance`
			`const int32_t score = _mm_cvtsi128_si32(_mm_sad_epu8(distances, _mm_setzero_si128())) & std::numeric_limits<uint8_t>::max();`
			`return score <= tolerance;`
			`}`
			`}`

			`#if defined(DEBUG_TEXTURE)`
			`void SaveAsBitmap(const char* filename) const;`
			`#endif`
			`};`