inline comment with prefix [MC] At 2021-04-29 03:50:26, "Josh Dekker" <j...@itanimul.li> wrote: >From: Rafal Dabrowa <fatwild...@gmail.com> > >Benchmarked on Apple M1: > >put_hevc_epel_bi_h4_8_c: 69.9 >put_hevc_epel_bi_h4_8_neon: 15.4 >put_hevc_epel_bi_h6_8_c: 137.1 >put_hevc_epel_bi_h6_8_neon: 31.9 >put_hevc_epel_bi_h8_8_c: 124.6 >put_hevc_epel_bi_h8_8_neon: 40.9 >put_hevc_epel_bi_h12_8_c: 331.9 >put_hevc_epel_bi_h12_8_neon: 72.4 >put_hevc_epel_bi_h16_8_c: 383.4 >put_hevc_epel_bi_h16_8_neon: 124.9 >put_hevc_epel_bi_h24_8_c: 771.6 >put_hevc_epel_bi_h24_8_neon: 209.6 >put_hevc_epel_bi_h32_8_c: 1324.4 >put_hevc_epel_bi_h32_8_neon: 389.4 >put_hevc_epel_bi_h48_8_c: 2869.6 >put_hevc_epel_bi_h48_8_neon: 730.1 >put_hevc_epel_bi_h64_8_c: 4992.6 >put_hevc_epel_bi_h64_8_neon: 1490.4 >put_hevc_epel_bi_hv4_8_c: 163.4 >put_hevc_epel_bi_hv4_8_neon: 38.4 >put_hevc_epel_bi_hv6_8_c: 292.4 >put_hevc_epel_bi_hv6_8_neon: 66.4 >put_hevc_epel_bi_hv8_8_c: 375.6 >put_hevc_epel_bi_hv8_8_neon: 62.4 >put_hevc_epel_bi_hv12_8_c: 831.6 >put_hevc_epel_bi_hv12_8_neon: 134.9 >put_hevc_epel_bi_hv16_8_c: 1257.9 >put_hevc_epel_bi_hv16_8_neon: 214.1 >put_hevc_epel_bi_hv24_8_c: 2666.6 >put_hevc_epel_bi_hv24_8_neon: 391.1 >put_hevc_epel_bi_hv32_8_c: 4722.4 >put_hevc_epel_bi_hv32_8_neon: 734.1 >put_hevc_epel_bi_hv48_8_c: 10100.4 >put_hevc_epel_bi_hv48_8_neon: 1570.4 >put_hevc_epel_bi_hv64_8_c: 17613.4 >put_hevc_epel_bi_hv64_8_neon: 2810.6 >put_hevc_epel_bi_v4_8_c: 77.4 >put_hevc_epel_bi_v4_8_neon: 18.6 >put_hevc_epel_bi_v6_8_c: 142.1 >put_hevc_epel_bi_v6_8_neon: 27.1 >put_hevc_epel_bi_v8_8_c: 192.9 >put_hevc_epel_bi_v8_8_neon: 9.1 >put_hevc_epel_bi_v12_8_c: 415.6 >put_hevc_epel_bi_v12_8_neon: 55.6 >put_hevc_epel_bi_v16_8_c: 487.6 >put_hevc_epel_bi_v16_8_neon: 61.9 >put_hevc_epel_bi_v24_8_c: 957.4 >put_hevc_epel_bi_v24_8_neon: 131.1 >put_hevc_epel_bi_v32_8_c: 1540.4 >put_hevc_epel_bi_v32_8_neon: 210.4 >put_hevc_epel_bi_v48_8_c: 3242.9 >put_hevc_epel_bi_v48_8_neon: 465.6 >put_hevc_epel_bi_v64_8_c: 5441.1 
>put_hevc_epel_bi_v64_8_neon: 818.1 >put_hevc_epel_h4_8_c: 41.6 >put_hevc_epel_h4_8_neon: 8.4 >put_hevc_epel_h6_8_c: 110.1 >put_hevc_epel_h6_8_neon: 24.4 >put_hevc_epel_h8_8_c: 41.6 >put_hevc_epel_h8_8_neon: 17.6 >put_hevc_epel_h12_8_c: 183.1 >put_hevc_epel_h12_8_neon: 58.1 >put_hevc_epel_h16_8_c: 146.6 >put_hevc_epel_h16_8_neon: 83.4 >put_hevc_epel_h24_8_c: 240.4 >put_hevc_epel_h24_8_neon: 157.1 >put_hevc_epel_h32_8_c: 431.1 >put_hevc_epel_h32_8_neon: 292.1 >put_hevc_epel_h48_8_c: 858.6 >put_hevc_epel_h48_8_neon: 557.4 >put_hevc_epel_h64_8_c: 1536.6 >put_hevc_epel_h64_8_neon: 1116.6 >put_hevc_epel_hv4_8_c: 152.6 >put_hevc_epel_hv4_8_neon: 34.9 >put_hevc_epel_hv6_8_c: 269.6 >put_hevc_epel_hv6_8_neon: 61.6 >put_hevc_epel_hv8_8_c: 307.4 >put_hevc_epel_hv8_8_neon: 76.9 >put_hevc_epel_hv12_8_c: 702.6 >put_hevc_epel_hv12_8_neon: 113.1 >put_hevc_epel_hv16_8_c: 1081.4 >put_hevc_epel_hv16_8_neon: 190.6 >put_hevc_epel_hv24_8_c: 2276.1 >put_hevc_epel_hv24_8_neon: 345.1 >put_hevc_epel_hv32_8_c: 4068.6 >put_hevc_epel_hv32_8_neon: 780.4 >put_hevc_epel_hv48_8_c: 8754.1 >put_hevc_epel_hv48_8_neon: 1394.4 >put_hevc_epel_hv64_8_c: 15402.1 >put_hevc_epel_hv64_8_neon: 2616.6 >put_hevc_epel_uni_hv4_8_c: 142.1 >put_hevc_epel_uni_hv4_8_neon: 46.6 >put_hevc_epel_uni_hv6_8_c: 298.4 >put_hevc_epel_uni_hv6_8_neon: 72.4 >put_hevc_epel_uni_hv8_8_c: 352.9 >put_hevc_epel_uni_hv8_8_neon: 75.1 >put_hevc_epel_uni_hv12_8_c: 776.6 >put_hevc_epel_uni_hv12_8_neon: 125.9 >put_hevc_epel_uni_hv16_8_c: 1216.1 >put_hevc_epel_uni_hv16_8_neon: 199.1 >put_hevc_epel_uni_hv24_8_c: 2577.9 >put_hevc_epel_uni_hv24_8_neon: 386.6 >put_hevc_epel_uni_hv32_8_c: 4554.9 >put_hevc_epel_uni_hv32_8_neon: 710.9 >put_hevc_epel_uni_hv48_8_c: 9869.1 >put_hevc_epel_uni_hv48_8_neon: 1499.4 >put_hevc_epel_uni_hv64_8_c: 17307.1 >put_hevc_epel_uni_hv64_8_neon: 2750.6 >put_hevc_epel_uni_v4_8_c: 59.9 >put_hevc_epel_uni_v4_8_neon: 21.9 >put_hevc_epel_uni_v6_8_c: 136.1 >put_hevc_epel_uni_v6_8_neon: 19.6 >put_hevc_epel_uni_v8_8_c: 222.4 
>put_hevc_epel_uni_v8_8_neon: 17.1 >put_hevc_epel_uni_v12_8_c: 481.6 >put_hevc_epel_uni_v12_8_neon: 42.4 >put_hevc_epel_uni_v16_8_c: 424.4 >put_hevc_epel_uni_v16_8_neon: 63.4 >put_hevc_epel_uni_v24_8_c: 1184.1 >put_hevc_epel_uni_v24_8_neon: 109.9 >put_hevc_epel_uni_v32_8_c: 1401.1 >put_hevc_epel_uni_v32_8_neon: 182.9 >put_hevc_epel_uni_v48_8_c: 2933.9 >put_hevc_epel_uni_v48_8_neon: 388.9 >put_hevc_epel_uni_v64_8_c: 5044.9 >put_hevc_epel_uni_v64_8_neon: 701.1 >put_hevc_epel_v4_8_c: 31.9 >put_hevc_epel_v4_8_neon: 13.4 >put_hevc_epel_v6_8_c: 95.1 >put_hevc_epel_v6_8_neon: 16.4 >put_hevc_epel_v8_8_c: 98.9 >put_hevc_epel_v8_8_neon: 26.1 >put_hevc_epel_v12_8_c: 283.9 >put_hevc_epel_v12_8_neon: 36.9 >put_hevc_epel_v16_8_c: 229.6 >put_hevc_epel_v16_8_neon: 41.9 >put_hevc_epel_v24_8_c: 376.4 >put_hevc_epel_v24_8_neon: 90.4 >put_hevc_epel_v32_8_c: 577.4 >put_hevc_epel_v32_8_neon: 188.4 >put_hevc_epel_v48_8_c: 1058.4 >put_hevc_epel_v48_8_neon: 350.6 >put_hevc_epel_v64_8_c: 1647.4 >put_hevc_epel_v64_8_neon: 647.9 >put_hevc_pel_bi_pixels4_8_c: 39.1 >put_hevc_pel_bi_pixels4_8_neon: 36.4 >put_hevc_pel_bi_pixels6_8_c: 78.6 >put_hevc_pel_bi_pixels6_8_neon: 0.-6 >put_hevc_pel_bi_pixels8_8_c: 60.6 >put_hevc_pel_bi_pixels8_8_neon: 14.1 >put_hevc_pel_bi_pixels12_8_c: 186.1 >put_hevc_pel_bi_pixels12_8_neon: 30.4 >put_hevc_pel_bi_pixels16_8_c: 231.9 >put_hevc_pel_bi_pixels16_8_neon: 32.1 >put_hevc_pel_bi_pixels24_8_c: 454.1 >put_hevc_pel_bi_pixels24_8_neon: 70.1 >put_hevc_pel_bi_pixels32_8_c: 774.1 >put_hevc_pel_bi_pixels32_8_neon: 102.1 >put_hevc_pel_bi_pixels48_8_c: 1632.9 >put_hevc_pel_bi_pixels48_8_neon: 220.4 >put_hevc_pel_bi_pixels64_8_c: 2812.9 >put_hevc_pel_bi_pixels64_8_neon: 402.4 >put_hevc_pel_pixels4_8_c: 41.1 >put_hevc_pel_pixels4_8_neon: 6.4 >put_hevc_pel_pixels6_8_c: 45.1 >put_hevc_pel_pixels6_8_neon: 5.4 >put_hevc_pel_pixels8_8_c: 94.6 >put_hevc_pel_pixels8_8_neon: 15.6 >put_hevc_pel_pixels12_8_c: 198.6 >put_hevc_pel_pixels12_8_neon: 15.4 >put_hevc_pel_pixels16_8_c: 87.9 
>put_hevc_pel_pixels16_8_neon: 18.1 >put_hevc_pel_pixels24_8_c: 310.6 >put_hevc_pel_pixels24_8_neon: 39.6 >put_hevc_pel_pixels32_8_c: 198.6 >put_hevc_pel_pixels32_8_neon: 78.1 >put_hevc_pel_pixels48_8_c: 372.4 >put_hevc_pel_pixels48_8_neon: 173.1 >put_hevc_pel_pixels64_8_c: 569.1 >put_hevc_pel_pixels64_8_neon: 324.4 >put_hevc_qpel_bi_h4_8_c: 101.4 >put_hevc_qpel_bi_h4_8_neon: 34.6 >put_hevc_qpel_bi_h6_8_c: 270.1 >put_hevc_qpel_bi_h6_8_neon: 61.6 >put_hevc_qpel_bi_h8_8_c: 165.6 >put_hevc_qpel_bi_h8_8_neon: 62.9 >put_hevc_qpel_bi_h12_8_c: 546.4 >put_hevc_qpel_bi_h12_8_neon: 124.1 >put_hevc_qpel_bi_h16_8_c: 536.9 >put_hevc_qpel_bi_h16_8_neon: 178.6 >put_hevc_qpel_bi_h24_8_c: 1151.6 >put_hevc_qpel_bi_h24_8_neon: 316.6 >put_hevc_qpel_bi_h32_8_c: 1981.4 >put_hevc_qpel_bi_h32_8_neon: 575.4 >put_hevc_qpel_bi_h48_8_c: 4336.6 >put_hevc_qpel_bi_h48_8_neon: 1189.6 >put_hevc_qpel_bi_h64_8_c: 7591.6 >put_hevc_qpel_bi_h64_8_neon: 2184.9 >put_hevc_qpel_bi_hv4_8_c: 438.9 >put_hevc_qpel_bi_hv4_8_neon: 97.6 >put_hevc_qpel_bi_hv6_8_c: 829.1 >put_hevc_qpel_bi_hv6_8_neon: 131.4 >put_hevc_qpel_bi_hv8_8_c: 983.9 >put_hevc_qpel_bi_hv8_8_neon: 146.1 >put_hevc_qpel_bi_hv12_8_c: 2050.9 >put_hevc_qpel_bi_hv12_8_neon: 364.6 >put_hevc_qpel_bi_hv16_8_c: 3028.4 >put_hevc_qpel_bi_hv16_8_neon: 432.6 >put_hevc_qpel_bi_hv24_8_c: 6294.9 >put_hevc_qpel_bi_hv24_8_neon: 910.1 >put_hevc_qpel_bi_hv32_8_c: 10583.4 >put_hevc_qpel_bi_hv32_8_neon: 1345.9 >put_hevc_qpel_bi_hv48_8_c: 22412.4 >put_hevc_qpel_bi_hv48_8_neon: 2852.6 >put_hevc_qpel_bi_hv64_8_c: 38653.9 >put_hevc_qpel_bi_hv64_8_neon: 5094.1 >put_hevc_qpel_bi_v4_8_c: 143.9 >put_hevc_qpel_bi_v4_8_neon: 25.9 >put_hevc_qpel_bi_v6_8_c: 296.6 >put_hevc_qpel_bi_v6_8_neon: 35.1 >put_hevc_qpel_bi_v8_8_c: 515.4 >put_hevc_qpel_bi_v8_8_neon: 31.6 >put_hevc_qpel_bi_v12_8_c: 1175.6 >put_hevc_qpel_bi_v12_8_neon: 81.1 >put_hevc_qpel_bi_v16_8_c: 2051.6 >put_hevc_qpel_bi_v16_8_neon: 111.1 >put_hevc_qpel_bi_v24_8_c: 4556.9 >put_hevc_qpel_bi_v24_8_neon: 208.6 
>put_hevc_qpel_bi_v32_8_c: 8048.1 >put_hevc_qpel_bi_v32_8_neon: 351.6 >put_hevc_qpel_bi_v48_8_c: 18009.9 >put_hevc_qpel_bi_v48_8_neon: 773.1 >put_hevc_qpel_bi_v64_8_c: 31784.9 >put_hevc_qpel_bi_v64_8_neon: 1370.6 >put_hevc_qpel_h4_8_c: 120.1 >put_hevc_qpel_h4_8_neon: 33.1 >put_hevc_qpel_h6_8_c: 241.6 >put_hevc_qpel_h6_8_neon: 29.1 >put_hevc_qpel_h8_8_c: 70.6 >put_hevc_qpel_h8_8_neon: 52.6 >put_hevc_qpel_h12_8_c: 347.4 >put_hevc_qpel_h12_8_neon: 111.1 >put_hevc_qpel_h16_8_c: 180.4 >put_hevc_qpel_h16_8_neon: 149.9 >put_hevc_qpel_h24_8_c: 333.4 >put_hevc_qpel_h24_8_neon: 289.1 >put_hevc_qpel_h32_8_c: 597.1 >put_hevc_qpel_h32_8_neon: 478.9 >put_hevc_qpel_h48_8_c: 1262.6 >put_hevc_qpel_h48_8_neon: 975.6 >put_hevc_qpel_h64_8_c: 2212.4 >put_hevc_qpel_h64_8_neon: 1831.9 >put_hevc_qpel_hv4_8_c: 430.9 >put_hevc_qpel_hv4_8_neon: 77.4 >put_hevc_qpel_hv6_8_c: 785.9 >put_hevc_qpel_hv6_8_neon: 122.9 >put_hevc_qpel_hv8_8_c: 921.9 >put_hevc_qpel_hv8_8_neon: 150.1 >put_hevc_qpel_hv12_8_c: 1943.4 >put_hevc_qpel_hv12_8_neon: 245.4 >put_hevc_qpel_hv16_8_c: 2886.9 >put_hevc_qpel_hv16_8_neon: 375.4 >put_hevc_qpel_hv24_8_c: 5954.6 >put_hevc_qpel_hv24_8_neon: 711.4 >put_hevc_qpel_hv32_8_c: 9967.1 >put_hevc_qpel_hv32_8_neon: 1161.1 >put_hevc_qpel_hv48_8_c: 21173.1 >put_hevc_qpel_hv48_8_neon: 2593.9 >put_hevc_qpel_hv64_8_c: 37378.1 >put_hevc_qpel_hv64_8_neon: 4470.4 >put_hevc_qpel_uni_h4_8_c: 108.4 >put_hevc_qpel_uni_h4_8_neon: 38.9 >put_hevc_qpel_uni_h6_8_c: 237.9 >put_hevc_qpel_uni_h6_8_neon: 54.6 >put_hevc_qpel_uni_h8_8_c: 432.4 >put_hevc_qpel_uni_h8_8_neon: 64.9 >put_hevc_qpel_uni_h12_8_c: 1019.4 >put_hevc_qpel_uni_h12_8_neon: 116.1 >put_hevc_qpel_uni_h16_8_c: 463.6 >put_hevc_qpel_uni_h16_8_neon: 153.1 >put_hevc_qpel_uni_h24_8_c: 1919.4 >put_hevc_qpel_uni_h24_8_neon: 292.1 >put_hevc_qpel_uni_h32_8_c: 1800.6 >put_hevc_qpel_uni_h32_8_neon: 496.9 >put_hevc_qpel_uni_h48_8_c: 4056.1 >put_hevc_qpel_uni_h48_8_neon: 1071.1 >put_hevc_qpel_uni_h64_8_c: 7149.9 >put_hevc_qpel_uni_h64_8_neon: 1820.6 
>put_hevc_qpel_uni_hv4_8_c: 444.6 >put_hevc_qpel_uni_hv4_8_neon: 86.6 >put_hevc_qpel_uni_hv6_8_c: 810.6 >put_hevc_qpel_uni_hv6_8_neon: 121.9 >put_hevc_qpel_uni_hv8_8_c: 949.6 >put_hevc_qpel_uni_hv8_8_neon: 137.6 >put_hevc_qpel_uni_hv12_8_c: 2021.6 >put_hevc_qpel_uni_hv12_8_neon: 261.1 >put_hevc_qpel_uni_hv16_8_c: 3004.6 >put_hevc_qpel_uni_hv16_8_neon: 367.1 >put_hevc_qpel_uni_hv24_8_c: 6204.9 >put_hevc_qpel_uni_hv24_8_neon: 813.1 >put_hevc_qpel_uni_hv32_8_c: 10447.4 >put_hevc_qpel_uni_hv32_8_neon: 1216.4 >put_hevc_qpel_uni_hv48_8_c: 22322.9 >put_hevc_qpel_uni_hv48_8_neon: 2531.6 >put_hevc_qpel_uni_hv64_8_c: 38859.9 >put_hevc_qpel_uni_hv64_8_neon: 4528.9 >put_hevc_qpel_uni_v4_8_c: 124.6 >put_hevc_qpel_uni_v4_8_neon: 33.9 >put_hevc_qpel_uni_v6_8_c: 260.6 >put_hevc_qpel_uni_v6_8_neon: 28.6 >put_hevc_qpel_uni_v8_8_c: 480.4 >put_hevc_qpel_uni_v8_8_neon: 30.4 >put_hevc_qpel_uni_v12_8_c: 1101.4 >put_hevc_qpel_uni_v12_8_neon: 72.1 >put_hevc_qpel_uni_v16_8_c: 720.4 >put_hevc_qpel_uni_v16_8_neon: 87.4 >put_hevc_qpel_uni_v24_8_c: 2443.4 >put_hevc_qpel_uni_v24_8_neon: 253.9 >put_hevc_qpel_uni_v32_8_c: 2328.6 >put_hevc_qpel_uni_v32_8_neon: 311.4 >put_hevc_qpel_uni_v48_8_c: 4856.9 >put_hevc_qpel_uni_v48_8_neon: 692.6 >put_hevc_qpel_uni_v64_8_c: 8169.9 >put_hevc_qpel_uni_v64_8_neon: 1203.4 >put_hevc_qpel_v4_8_c: 123.6 >put_hevc_qpel_v4_8_neon: 26.1 >put_hevc_qpel_v6_8_c: 259.9 >put_hevc_qpel_v6_8_neon: 22.6 >put_hevc_qpel_v8_8_c: 197.4 >put_hevc_qpel_v8_8_neon: 24.9 >put_hevc_qpel_v12_8_c: 561.4 >put_hevc_qpel_v12_8_neon: 53.6 >put_hevc_qpel_v16_8_c: 474.9 >put_hevc_qpel_v16_8_neon: 75.4 >put_hevc_qpel_v24_8_c: 799.9 >put_hevc_qpel_v24_8_neon: 159.1 >put_hevc_qpel_v32_8_c: 1214.1 >put_hevc_qpel_v32_8_neon: 267.9 >put_hevc_qpel_v48_8_c: 2217.6 >put_hevc_qpel_v48_8_neon: 639.1 >put_hevc_qpel_v64_8_c: 3495.4 >put_hevc_qpel_v64_8_neon: 1081.1 > >Signed-off-by: Josh Dekker <j...@itanimul.li> >--- > libavcodec/aarch64/Makefile | 4 +- > libavcodec/aarch64/hevcdsp_epel_neon.S | 3931 
++++++++++++++ > libavcodec/aarch64/hevcdsp_init_aarch64.c | 118 + > libavcodec/aarch64/hevcdsp_qpel_neon.S | 5646 +++++++++++++++++++++ > 4 files changed, 9698 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S > create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S > >diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile >index 954461f81d..ebedc03bfa 100644 >--- a/libavcodec/aarch64/Makefile >+++ b/libavcodec/aarch64/Makefile >@@ -61,6 +61,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += >aarch64/vp9itxfm_16bpp_neon.o \ > aarch64/vp9lpf_neon.o > \ > aarch64/vp9mc_16bpp_neon.o > \ > aarch64/vp9mc_neon.o >-NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o > \ >+NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_epel_neon.o > \ >+ aarch64/hevcdsp_idct_neon.o > \ > aarch64/hevcdsp_init_aarch64.o > \ >+ aarch64/hevcdsp_qpel_neon.o > \ > aarch64/hevcdsp_sao_neon.o >diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S >b/libavcodec/aarch64/hevcdsp_epel_neon.S >new file mode 100644 >index 0000000000..0366fe8ae3 >--- /dev/null >+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S >@@ -0,0 +1,3931 @@ >+/* -*-arm64-*- >+ * vim: syntax=arm64asm >+ * >+ * This file is part of FFmpeg. >+ * >+ * FFmpeg is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU Lesser General Public >+ * License as published by the Free Software Foundation; either >+ * version 2.1 of the License, or (at your option) any later version. >+ * >+ * FFmpeg is distributed in the hope that it will be useful, >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >+ * Lesser General Public License for more details. 
>+ * >+ * You should have received a copy of the GNU Lesser General Public >+ * License along with FFmpeg; if not, write to the Free Software >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >USA >+ */ >+ >+#include "libavutil/aarch64/asm.S" >+#define MAX_PB_SIZE 64 >+ >+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1 >+ mov x7, #(MAX_PB_SIZE * 2)
>+1: ld1 {v0.s}[0], [x1], x2 [MC] I don't have an M1, so I am not sure how this instruction behaves there. However, in the A57 doc, LD1 has latency=8, throughput=1, while LD1R has latency=5, throughput=1. Moreover, I guess all of the interpolation functions work on an even number of rows, so we can unroll a little. Further, we may insert the SUB in between the LD/ST to avoid a pipeline stall, and use CBNZ to avoid affecting the flags register. >+ ushll v4.8h, v0.8b, #6 >+ st1 {v4.d}[0], [x0], x7 >+ subs x3, x3, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1 >+ mov x7, #(MAX_PB_SIZE * 2 - 8) >+1: ld1 {v0.8b}, [x1], x2 >+ ushll v4.8h, v0.8b, #6 >+ st1 {v4.d}[0], [x0], #8 >+ st1 {v4.s}[2], [x0], x7 >+ subs x3, x3, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1 >+ mov x7, #(MAX_PB_SIZE * 2) >+1: ld1 {v0.8b}, [x1], x2 >+ ushll v4.8h, v0.8b, #6 >+ st1 {v4.8h}, [x0], x7 >+ subs x3, x3, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1 >+ mov x7, #(MAX_PB_SIZE * 2 - 16) >+1: ld1 {v0.8b, v1.8b}, [x1], x2 >+ ushll v4.8h, v0.8b, #6 >+ st1 {v4.8h}, [x0], #16 >+ ushll v5.8h, v1.8b, #6 >+ st1 {v5.d}[0], [x0], x7 >+ subs x3, x3, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1 >+ mov x7, #(MAX_PB_SIZE * 2) >+1: ld1 {v0.8b, v1.8b}, [x1], x2 >+ ushll v4.8h, v0.8b, #6 >+ ushll v5.8h, v1.8b, #6 >+ st1 {v4.8h, v5.8h}, [x0], x7 >+ subs x3, x3, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1 >+ mov x7, #(MAX_PB_SIZE * 2) >+1: ld1 {v0.8b, v1.8b, v2.8b}, [x1], x2 >+ ushll v4.8h, v0.8b, #6 >+ ushll v5.8h, v1.8b, #6 >+ ushll v6.8h, v2.8b, #6 >+ st1 {v4.8h, v5.8h, v6.8h}, [x0], x7 >+ subs x3, x3, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1 >+ mov x7, #(MAX_PB_SIZE * 2) >+1: ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], x2 >+ ushll v4.8h, v0.8b, #6 >+ ushll v5.8h, v1.8b, #6 >+ ushll v6.8h, v2.8b, #6 >+ ushll 
v7.8h, v3.8b, #6 >+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x7 >+ subs x3, x3, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1 >+ mov x7, #(MAX_PB_SIZE) >+1: ld1 {v0.16b, v1.16b, v2.16b}, [x1], x2 >+ ushll v4.8h, v0.8b, #6 >+ ushll2 v5.8h, v0.16b, #6 >+ ushll v6.8h, v1.8b, #6 >+ ushll2 v7.8h, v1.16b, #6 >+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 >+ ushll v4.8h, v2.8b, #6 >+ ushll2 v5.8h, v2.16b, #6 >+ st1 {v4.8h, v5.8h}, [x0], x7 >+ subs x3, x3, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1 >+1: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2 >+ ushll v4.8h, v0.8b, #6 >+ ushll2 v5.8h, v0.16b, #6 >+ ushll v6.8h, v1.8b, #6 >+ ushll2 v7.8h, v1.16b, #6 >+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE) >+ ushll v4.8h, v2.8b, #6 >+ ushll2 v5.8h, v2.16b, #6 >+ ushll v6.8h, v3.8b, #6 >+ ushll2 v7.8h, v3.16b, #6 >+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE) >+ subs x3, x3, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1 >+ mov x10, #(MAX_PB_SIZE * 2) >+1: ld1 {v0.s}[0], [x2], x3 // src >+ ushll v16.8h, v0.8b, #6 >+ ld1 {v20.4h}, [x4], x10 // src2 >+ sqadd v16.8h, v16.8h, v20.8h >+ sqrshrun v0.8b, v16.8h, #7 >+ st1 {v0.s}[0], [x0], x1 >+ subs x5, x5, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1 >+ mov x10, #(MAX_PB_SIZE * 2 - 8) >+ sub x1, x1, #4 >+1: ld1 {v0.8b}, [x2], x3 >+ ushll v16.8h, v0.8b, #6 >+ ld1 {v20.4h}, [x4], #8 >+ ld1 {v20.s}[2], [x4], x10 >+ sqadd v16.8h, v16.8h, v20.8h >+ sqrshrun v0.8b, v16.8h, #7 >+ st1 {v0.s}[0], [x0], #4 >+ st1 {v0.h}[2], [x0], x1 >+ subs x5, x5, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1 >+ mov x10, #(MAX_PB_SIZE * 2) >+1: ld1 {v0.8b}, [x2], x3 // src >+ ushll v16.8h, v0.8b, #6 >+ ld1 {v20.8h}, [x4], x10 // src2 >+ sqadd v16.8h, v16.8h, v20.8h >+ sqrshrun v0.8b, v16.8h, #7 >+ st1 
{v0.8b}, [x0], x1 >+ subs x5, x5, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1 >+ mov x10, #(MAX_PB_SIZE * 2 - 16) >+ sub x1, x1, #8 >+1: ld1 {v0.16b}, [x2], x3 >+ ushll v16.8h, v0.8b, #6 >+ ushll2 v17.8h, v0.16b, #6 >+ ld1 {v20.8h}, [x4], #16 >+ ld1 {v21.4h}, [x4], x10 >+ sqadd v16.8h, v16.8h, v20.8h >+ sqadd v17.8h, v17.8h, v21.8h >+ sqrshrun v0.8b, v16.8h, #7 >+ sqrshrun2 v0.16b, v17.8h, #7 >+ st1 {v0.8b}, [x0], #8 >+ st1 {v0.s}[2], [x0], x1 >+ subs x5, x5, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1 >+ mov x10, #(MAX_PB_SIZE * 2) >+1: ld1 {v0.16b}, [x2], x3 // src >+ ushll v16.8h, v0.8b, #6 >+ ushll2 v17.8h, v0.16b, #6 >+ ld1 {v20.8h, v21.8h}, [x4], x10 // src2 >+ sqadd v16.8h, v16.8h, v20.8h >+ sqadd v17.8h, v17.8h, v21.8h >+ sqrshrun v0.8b, v16.8h, #7 >+ sqrshrun2 v0.16b, v17.8h, #7 >+ st1 {v0.16b}, [x0], x1 >+ subs x5, x5, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1 >+ mov x10, #(MAX_PB_SIZE * 2) >+1: ld1 {v0.8b, v1.8b, v2.8b}, [x2], x3 // src >+ ushll v16.8h, v0.8b, #6 >+ ushll v17.8h, v1.8b, #6 >+ ushll v18.8h, v2.8b, #6 >+ ld1 {v20.8h, v21.8h, v22.8h}, [x4], x10 // src2 >+ sqadd v16.8h, v16.8h, v20.8h >+ sqadd v17.8h, v17.8h, v21.8h >+ sqadd v18.8h, v18.8h, v22.8h >+ sqrshrun v0.8b, v16.8h, #7 >+ sqrshrun v1.8b, v17.8h, #7 >+ sqrshrun v2.8b, v18.8h, #7 >+ st1 {v0.8b, v1.8b, v2.8b}, [x0], x1 >+ subs x5, x5, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1 >+ mov x10, #(MAX_PB_SIZE * 2) >+1: ld1 {v0.16b, v1.16b}, [x2], x3 // src >+ ushll v16.8h, v0.8b, #6 >+ ushll2 v17.8h, v0.16b, #6 >+ ushll v18.8h, v1.8b, #6 >+ ushll2 v19.8h, v1.16b, #6 >+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], x10 // src2 >+ sqadd v16.8h, v16.8h, v20.8h >+ sqadd v17.8h, v17.8h, v21.8h >+ sqadd v18.8h, v18.8h, v22.8h >+ sqadd v19.8h, v19.8h, v23.8h >+ sqrshrun v0.8b, v16.8h, #7 >+ 
sqrshrun2 v0.16b, v17.8h, #7 >+ sqrshrun v1.8b, v18.8h, #7 >+ sqrshrun2 v1.16b, v19.8h, #7 >+ st1 {v0.16b, v1.16b}, [x0], x1 >+ subs x5, x5, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1 >+ mov x10, #(MAX_PB_SIZE) >+1: ld1 {v0.16b, v1.16b, v2.16b}, [x2], x3 // src >+ ushll v16.8h, v0.8b, #6 >+ ushll2 v17.8h, v0.16b, #6 >+ ushll v18.8h, v1.8b, #6 >+ ushll2 v19.8h, v1.16b, #6 >+ ushll v20.8h, v2.8b, #6 >+ ushll2 v21.8h, v2.16b, #6 >+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) > // src2 >+ sqadd v16.8h, v16.8h, v24.8h >+ sqadd v17.8h, v17.8h, v25.8h >+ sqadd v18.8h, v18.8h, v26.8h >+ sqadd v19.8h, v19.8h, v27.8h >+ ld1 {v24.8h, v25.8h}, [x4], x10 >+ sqadd v20.8h, v20.8h, v24.8h >+ sqadd v21.8h, v21.8h, v25.8h >+ sqrshrun v0.8b, v16.8h, #7 >+ sqrshrun2 v0.16b, v17.8h, #7 >+ sqrshrun v1.8b, v18.8h, #7 >+ sqrshrun2 v1.16b, v19.8h, #7 >+ sqrshrun v2.8b, v20.8h, #7 >+ sqrshrun2 v2.16b, v21.8h, #7 >+ st1 {v0.16b, v1.16b, v2.16b}, [x0], x1 >+ subs x5, x5, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1 >+1: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 >// src >+ ushll v16.8h, v0.8b, #6 >+ ushll2 v17.8h, v0.16b, #6 >+ ushll v18.8h, v1.8b, #6 >+ ushll2 v19.8h, v1.16b, #6 >+ ushll v20.8h, v2.8b, #6 >+ ushll2 v21.8h, v2.16b, #6 >+ ushll v22.8h, v3.8b, #6 >+ ushll2 v23.8h, v3.16b, #6 >+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) > // src2 >+ sqadd v16.8h, v16.8h, v24.8h >+ sqadd v17.8h, v17.8h, v25.8h >+ sqadd v18.8h, v18.8h, v26.8h >+ sqadd v19.8h, v19.8h, v27.8h >+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) >+ sqadd v20.8h, v20.8h, v24.8h >+ sqadd v21.8h, v21.8h, v25.8h >+ sqadd v22.8h, v22.8h, v26.8h >+ sqadd v23.8h, v23.8h, v27.8h >+ sqrshrun v0.8b, v16.8h, #7 >+ sqrshrun2 v0.16b, v17.8h, #7 >+ sqrshrun v1.8b, v18.8h, #7 >+ sqrshrun2 v1.16b, v19.8h, #7 >+ sqrshrun v2.8b, v20.8h, #7 >+ sqrshrun2 v2.16b, v21.8h, #7 >+ sqrshrun v3.8b, 
v22.8h, #7 >+ sqrshrun2 v3.16b, v23.8h, #7 >+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 >+ subs x5, x5, #1 >+ b.ne 1b >+ ret >+endfunc >+ >+.Lepel_filters: >+ .byte 0, 0, 0, 0 >+ .byte -2, 58, 10, -2 >+ .byte -4, 54, 16, -2 >+ .byte -6, 46, 28, -4 >+ .byte -4, 36, 36, -4 >+ .byte -4, 28, 46, -6 >+ .byte -2, 16, 54, -4 >+ .byte -2, 10, 58, -2 >+ >+.macro load_epel_filterb freg, xreg >+ adr \xreg, .Lepel_filters >+ add \xreg, \xreg, \freg, lsl #2 >+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter >+ neg v0.16b, v0.16b >+ neg v3.16b, v3.16b [MC] Why not put abs(x) in the constant table? >+.endm >+ >+.macro calc_epelb dst, src1, src2, src3, src4 >+ umlsl \dst\().8h, \src1\().8b, v0.8b >+ umlal \dst\().8h, \src2\().8b, v1.8b >+ umlal \dst\().8h, \src3\().8b, v2.8b >+ umlsl \dst\().8h, \src4\().8b, v3.8b >+.endm >+ >+.macro calc_epelb2 dst, src1, src2, src3, src4 >+ umlsl2 \dst\().8h, \src1\().16b, v0.16b >+ umlal2 \dst\().8h, \src2\().16b, v1.16b >+ umlal2 \dst\().8h, \src3\().16b, v2.16b >+ umlsl2 \dst\().8h, \src4\().16b, v3.16b >+.endm >+ >+.macro load_epel_filterh freg, xreg >+ adr \xreg, .Lepel_filters >+ add \xreg, \xreg, \freg, lsl #2 >+ ld1 {v0.8b}, [\xreg] >+ sxtl v0.8h, v0.8b >+.endm >+ >+.macro calc_epelh dst, src1, src2, src3, src4 >+ smull \dst\().4s, \src1\().4h, v0.h[0] >+ smlal \dst\().4s, \src2\().4h, v0.h[1] >+ smlal \dst\().4s, \src3\().4h, v0.h[2] >+ smlal \dst\().4s, \src4\().4h, v0.h[3] >+ sqshrn \dst\().4h, \dst\().4s, #6 >+.endm >+ >+.macro calc_epelh2 dst, tmp, src1, src2, src3, src4 >+ smull2 \tmp\().4s, \src1\().8h, v0.h[0] >+ smlal2 \tmp\().4s, \src2\().8h, v0.h[1] >+ smlal2 \tmp\().4s, \src3\().8h, v0.h[2] >+ smlal2 \tmp\().4s, \src4\().8h, v0.h[3] >+ sqshrn2 \dst\().8h, \tmp\().4s, #6 >+.endm >+ >+function ff_hevc_put_hevc_epel_h4_8_neon, export=1 >+ load_epel_filterb x4, x5 >+ sub x1, x1, #1 >+ mov x10, #(MAX_PB_SIZE * 2) >+1: ld1 {v4.8b}, [x1], x2 >+ ushr v5.2d, v4.2d, #8 >+ ushr v6.2d, v5.2d, #8 >+ ushr v7.2d, v6.2d, #8 >+ 
movi v16.8h, #0 >+ calc_epelb v16, v4, v5, v6, v7 >+ st1 {v16.4h}, [x0], x10 >+ subs x3, x3, #1 // height >+ b.ne 1b >+ ret >+endfunc >+ >+function ff_hevc_put_hevc_epel_h6_8_neon, export=1 >+ load_epel_filterb x4, x5 >+ sub x1, x1, #1 >+ sub x2, x2, #8 >+ mov x10, #(MAX_PB_SIZE * 2 - 8) >+1: ld1 {v24.8b}, [x1], #8 >+ ushr v26.2d, v24.2d, #8 >+ ushr v27.2d, v26.2d, #8 >+ ushr v28.2d, v27.2d, #8 [MC] This dependency chain will cause a pipeline stall; how about using EXT or LD1 directly? >+ movi v16.8h, #0 >+ ld1 {v28.b}[5], [x1], x2 >+ calc_epelb v16, v24, v26, v27, v28 >+ st1 {v16.4h}, [x0], #8 >+ st1 {v16.s}[2], [x0], x10 >+ subs x3, x3, #1 // height >+ b.ne 1b >+ ret >+endfunc >+ ... >-- >2.30.1 (Apple Git-130) > >_______________________________________________ >ffmpeg-devel mailing list >ffmpeg-devel@ffmpeg.org >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > >To unsubscribe, visit link above, or email >ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".