[FFmpeg-devel] [PATCH] avcodec/aarch64: Access externs via GOT with PIC

Triang3l Sun, 10 Jul 2022 12:16:21 -0700

Android, starting from version 6 (API level 23, from the year 2015), requires all shared libraries to be position-independent, and refuses to load shared libraries (which are the most common native binary type on Android as in Android applications, native code is placed in shared libraries accessed via the Java Native Interface) containing dynamic relocations.

To support PIC, all AArch64 assembly code in FFmpeg uses the `movrel` macro to obtain addresses of labels, such as lookup table constants, in a way that with CONFIG_PIC being 1, PC-relative addresses of labels are computed via the `adrp` instruction. This approach, however, is suitable only for labels defined in the same object file. For `adrp` to work directly between object files, the linker has to perform a text relocation. And in my scenario (libavcodec being a static library linked to a shared library, though I'm not sure if this is relevant), this resulted in the following LLVM linker errors, making my application not buildable for Android:

"relocation R_AARCH64_ADR_PREL_PG_HI21 cannot be used against symbolff_cos_32; recompile with -fPIC""can't create dynamic relocation R_AARCH64_ADD_ABS_LO12_NC againstsymbol: ff_cos_32 in readonly segment; recompile object files with -fPICor pass '-Wl,-z,notext' to allow text relocations in the output"

This commit brings the solution that is already implemented in FFmpeg on AArch32 to AArch64 - a separate macro, `movrelx`, which emits instructions for computing the address of an external label through the Global Offset Table (GOT).

The same targets that `movrel` is implemented for are covered by this commit. For Apple targets, the instruction sequence is the one that is generated for referencing an extern variable in C code by Clang for the `aarch64-apple-darwin` target triple. For other targets (Linux), this is the sequence emitted by Clang for the `aarch64-unknown-linux-gnu` target, by GCC, and specified in the "Assembly expressions" section of the Arm Compiler Reference Guide. The hardware-assisted AddressSanitizer has no effect on the sequence - Clang generates the same `:got:` and `:got_lo12:` addressing regardless of whether `-fsanitize=hwaddress` is used. Windows has no concept of PIC, and Windows builds should be done with CONFIG_PIC being 0, so it doesn't need to be handled.

The literal offset, however, is always applied using a separate `add` or `sub` instruction as the actual address is loaded indirectly for an extern object that is the whole lookup table itself. The only place where the offset is currently used with `movrelx` is VP9, with the offset being 256 or 512 bytes there. Unfortunately, that offset can't be moved to the positive immediate offset encoded in load/store instructions there without major restructuring, as the actual memory accesses are performed in a function that is common to different offset values, with the offset being pre-applied to one of its arguments instead. Without PIC though, `movrelx` is implemented exactly the same as `movrel`, with the offset applied directly to the `ldr` literal, so the non-PIC path is unaffected by this change.

Testing was performed on my local build setup for Android based on ndk-build. Two things were tested:

- Regression testing was done using the `libavcodec/tests/fft.c` test. `libavcodec` was built as a static library, and the test was built as a native executable (which, unlike shared libraries, isn't required to be position-independent). Both the executable without the changes and the executable with the new code were launched on a physical AArch64 device using Termux. As the length of the instruction sequences for `movrel` and `movrelx` without the offset is the same, comparing the two binaries in a diff tool has shown the expected 13 differences in the code - 12 in `fftN_neon` for different transform sizes, and 1 in `ff_fft_calc_neon`. The results for the FFT test were the same for both executables with different transform size values.

- To check sufficiency and suitability for fixing the original issue, the `fft.c` test was converted into a shared library (with the `main` function renamed), and a proxy executable was written that performs `dlopen` of the library and invokes the main test function from it via `dlsym`. Termux is built with `targetSdkVersion` 28, so the `dlopen` rule of Android API levels 23 and above disallowing dynamic relocations should apply to it. The testing device is running Android 11 (API level 30). The test was executed successfully, meaning that no relocations incompatible with PIC are required by libavcodec anymore.


Signed-off-by: Triang3l <trian...@yandex.ru>
---
 libavcodec/aarch64/fft_neon.S         |  4 ++--
 libavcodec/aarch64/sbrdsp_neon.S      |  2 +-
 libavcodec/aarch64/vp9mc_16bpp_neon.S |  4 ++--
 libavcodec/aarch64/vp9mc_neon.S       |  4 ++--
 libavutil/aarch64/asm.S               | 19 +++++++++++++++++++
 5 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index 9ff3f9c526..d52f14626b 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -353,7 +353,7 @@ function fft\n\()_neon, align=6
         sub             x0,  x28, #\n4*2*8
         ldp             x28, x30, [sp], #16
         AARCH64_VALIDATE_LINK_REGISTER
-        movrel          x4,  X(ff_cos_\n)
+        movrelx         x4,  X(ff_cos_\n)
         mov             x2,  #\n4>>1
         b               fft_pass_neon
 endfunc
@@ -384,7 +384,7 @@ function ff_fft_calc_neon, export=1
         movrel          x12, pmmp
         ldr             x3,  [x3, x2, lsl #3]
         movrel          x13, mppm
-        movrel          x14, X(ff_cos_16)
+        movrelx         x14, X(ff_cos_16)
         ld1             {v31.16b}, [x11]
         mov             x0,  x1
         ld1             {v29.4s},  [x12]         // pmmp

diff --git a/libavcodec/aarch64/sbrdsp_neon.S b/libavcodec/aarch64/sbrdsp_neon.S

index d23717e760..bbbc3811bd 100644
--- a/libavcodec/aarch64/sbrdsp_neon.S
+++ b/libavcodec/aarch64/sbrdsp_neon.S
@@ -273,7 +273,7 @@ endfunc
 .macro apply_noise_common
         sxtw            x3, w3
         sxtw            x5, w5
-        movrel          x7, X(ff_sbr_noise_table)
+        movrelx         x7, X(ff_sbr_noise_table)
         add             x3, x3, #1
 1:      and             x3, x3, #0x1ff
         add             x8, x7, x3, lsl #3

diff --git a/libavcodec/aarch64/vp9mc_16bpp_neon.S b/libavcodec/aarch64/vp9mc_16bpp_neon.S

index 53b372c262..ed472eb144 100644
--- a/libavcodec/aarch64/vp9mc_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9mc_16bpp_neon.S
@@ -287,7 +287,7 @@ do_8tap_h_size 16
 .macro do_8tap_h_func type, filter, offset, size, bpp
 function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
         mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
-        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
+        movrelx         x6,  X(ff_vp9_subpel_filters), 256*\offset
         cmp             w5,  #8
         add             x9,  x6,  w5, uxtw #4
         mov             x5,  #2*\size
@@ -574,7 +574,7 @@ do_8tap_4v avg
 function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
         uxtw            x4,  w4
         mvni            v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
-        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
+        movrelx         x5,  X(ff_vp9_subpel_filters), 256*\offset
         add             x6,  x5,  w6, uxtw #4
         mov             x5,  #\size
 .if \size >= 8

diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S

index abf2bae9db..48460ae485 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -353,7 +353,7 @@ do_8tap_h_size 16

 .macro do_8tap_h_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
-        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
+        movrelx         x6,  X(ff_vp9_subpel_filters), 256*\offset
         cmp             w5,  #8
         add             x9,  x6,  w5, uxtw #4
         mov             x5,  #\size
@@ -627,7 +627,7 @@ do_8tap_4v avg, 4, 3
 .macro do_8tap_v_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
         uxtw            x4,  w4
-        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
+        movrelx         x5,  X(ff_vp9_subpel_filters), 256*\offset
         cmp             w6,  #8
         add             x6,  x5,  w6, uxtw #4
         mov             x5,  #\size
diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S
index a7782415d7..a8207bf38a 100644
--- a/libavutil/aarch64/asm.S
+++ b/libavutil/aarch64/asm.S
@@ -229,6 +229,25 @@ ELF     .size   \name, . - \name
 #endif
 .endm

+.macro  movrelx rd, val, offset=0 /* rd = address of the external symbol val, plus the literal byte offset (default 0) */
+#if CONFIG_PIC /* PIC build: obtain the address of val through the GOT */
+#if defined(__APPLE__) /* Apple targets use the @GOTPAGE / @GOTPAGEOFF operand syntax */
+        adrp            \rd, \val@GOTPAGE               // rd = page containing the GOT entry of val
+        ldr             \rd, [\rd, \val@GOTPAGEOFF]     // rd = address of val, loaded from that GOT entry
+#else /* every non-Apple target; NOTE(review): there is no _WIN32 case, presumably Windows is expected to build with CONFIG_PIC 0 - confirm */
+        adrp            \rd, :got:\val                  // rd = page containing the GOT entry of val
+        ldr             \rd, [\rd, :got_lo12:\val]      // rd = address of val, loaded from that GOT entry
+#endif /* defined(__APPLE__) */
+    .if \offset > 0 /* the address was loaded indirectly, so the offset is applied with a separate instruction */
+        add             \rd, \rd, \offset
+    .elseif \offset < 0
+        sub             \rd, \rd, -(\offset)            // negated so that the immediate operand is positive
+    .endif /* no instruction is emitted when offset is 0 */
+#else /* CONFIG_PIC is 0 */
+        ldr             \rd, =\val+\offset              // literal load of the address with the offset folded in
+#endif /* CONFIG_PIC */
+.endm
+
 #define GLUE(a, b) a ## b
 #define JOIN(a, b) GLUE(a, b)
 #define X(s) JOIN(EXTERN_ASM, s)
--
2.17.0.windows.1


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] avcodec/aarch64: Access externs via GOT with PIC

Reply via email to