Skip to content

Instantly share code, notes, and snippets.

@shibayan
Created June 27, 2020 15:25
Show Gist options
  • Save shibayan/8839930e07898cae4de7be91bff9d5f6 to your computer and use it in GitHub Desktop.
Save shibayan/8839930e07898cae4de7be91bff9d5f6 to your computer and use it in GitHub Desktop.
libwebp patch for Windows on ARM (ARM64)
diff --git a/Makefile.vc b/Makefile.vc
index 886f981f..d9b1ba4b 100644
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -11,6 +11,8 @@ LIBWEBPDEMUX_BASENAME = libwebpdemux
ARCH = x86
!ELSE IF ! [ cl 2>&1 | find "x64" > NUL ]
ARCH = x64
+!ELSE IF ! [ cl 2>&1 | find "ARM64" > NUL ]
+ARCH = ARM64
!ELSE IF ! [ cl 2>&1 | find "ARM" > NUL ]
ARCH = ARM
!ELSE
diff --git a/src/dec/tree_dec.c b/src/dec/tree_dec.c
index 1c6fdea2..4c9f48b6 100644
--- a/src/dec/tree_dec.c
+++ b/src/dec/tree_dec.c
@@ -15,7 +15,7 @@
#include "src/utils/bit_reader_inl_utils.h"
#if !defined(USE_GENERIC_TREE)
-#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
+#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
// using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
#define USE_GENERIC_TREE 1 // ALTERNATE_CODE
#else
diff --git a/src/dsp/cost_neon.c b/src/dsp/cost_neon.c
index 8cc8ce58..da1b561e 100644
--- a/src/dsp/cost_neon.c
+++ b/src/dsp/cost_neon.c
@@ -29,7 +29,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1));
const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position));
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(_M_ARM64)
res->last = vmaxvq_u8(masked) - 1;
#else
const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked));
@@ -43,7 +43,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0);
--res->last;
-#endif // __aarch64__
+#endif // defined(__aarch64__) || defined(_M_ARM64)
res->coeffs = coeffs;
}
diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index fa851707..2b387e81 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -1428,7 +1428,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x8_t A = vld1_u8(dst - BPS); // top row
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
const uint16_t p2 = vaddlv_u8(A);
sum_top = vdupq_n_u16(p2);
#else
@@ -1511,7 +1511,7 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x16_t A = vld1q_u8(dst - BPS); // top row
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
const uint16_t p3 = vaddlvq_u8(A);
sum_top = vdupq_n_u16(p3);
#else
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index 78fc20a7..6a34e564 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -95,6 +95,11 @@ extern "C" {
#define WEBP_USE_INTRINSICS
#endif
+#if defined(_MSC_VER) && _MSC_VER >= 1927 && defined(_M_ARM64)
+#define WEBP_USE_NEON
+#define WEBP_USE_INTRINSICS
+#endif
+
#if defined(__mips__) && !defined(__mips64) && \
defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
#define WEBP_USE_MIPS32
diff --git a/src/utils/bit_reader_utils.c b/src/utils/bit_reader_utils.c
index 857cd609..935f6003 100644
--- a/src/utils/bit_reader_utils.c
+++ b/src/utils/bit_reader_utils.c
@@ -121,7 +121,7 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits,
#define VP8L_LOG8_WBITS 4 // Number of bytes needed to store VP8L_WBITS bits.
-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
+#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || defined(_M_ARM64) || \
defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64__) || defined(_M_X64)
#define VP8L_USE_FAST_LOAD
diff --git a/src/utils/bit_reader_utils.h b/src/utils/bit_reader_utils.h
index e64156e3..46bd8c81 100644
--- a/src/utils/bit_reader_utils.h
+++ b/src/utils/bit_reader_utils.h
@@ -58,17 +58,17 @@ extern "C" {
// BITS can be any multiple of 8 from 8 to 56 (inclusive).
// Pick values that fit natural register size.
-#if defined(__i386__) || defined(_M_IX86) // x86 32bit
+#if defined(__i386__) || defined(_M_IX86) // x86 32bit
#define BITS 24
-#elif defined(__x86_64__) || defined(_M_X64) // x86 64bit
+#elif defined(__x86_64__) || defined(_M_X64) // x86 64bit
#define BITS 56
-#elif defined(__arm__) || defined(_M_ARM) // ARM
+#elif defined(__arm__) || defined(_M_ARM) // ARM
#define BITS 24
-#elif defined(__aarch64__) // ARM 64bit
+#elif defined(__aarch64__) || defined(_M_ARM64) // ARM 64bit
#define BITS 56
-#elif defined(__mips__) // MIPS
+#elif defined(__mips__) // MIPS
#define BITS 24
-#else // reasonable default
+#else // reasonable default
#define BITS 24
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment