-rw-r--r-- | audio_utils/benchmarks/biquad_filter_benchmark.cpp | 462 | ||||
-rw-r--r-- | audio_utils/include/audio_utils/BiquadFilter.h | 728 | ||||
-rw-r--r-- | audio_utils/include/audio_utils/intrinsic_utils.h | 197 | ||||
-rw-r--r-- | audio_utils/tests/biquad_filter_tests.cpp | 20 | ||||
-rw-r--r-- | audio_utils/tests/intrinsic_tests.cpp | 14 |
diff --git a/audio_utils/benchmarks/biquad_filter_benchmark.cpp b/audio_utils/benchmarks/biquad_filter_benchmark.cpp index 0a61593..e912b18 100644 --- a/audio_utils/benchmarks/biquad_filter_benchmark.cpp +++ b/audio_utils/benchmarks/biquad_filter_benchmark.cpp @@ -25,6 +25,8 @@ #include <audio_utils/BiquadFilter.h> #include <audio_utils/format.h> +#pragma GCC diagnostic ignored "-Wunused-function" // we use override array assignment + static constexpr size_t DATA_SIZE = 1024; // The coefficients is a HPF with sampling frequency as 48000, center frequency as 600, // and Q as 0.707. As all the coefficients are not zero, they can be used to benchmark @@ -82,138 +84,327 @@ static void BiquadFilter1DArgs(benchmark::internal::Benchmark* b) { BENCHMARK(BM_BiquadFilter1D)->Apply(BiquadFilter1DArgs); /******************************************************************* - * A test result running on Pixel 4 for comparison. - * The first parameter indicates the input data is subnormal or not. - * 0 for normal input data, 1 for subnormal input data. - * The second parameter indicates the channel count. - * The third parameter indicates the occupancy of the coefficients. - * ----------------------------------------------------------------- - * Benchmark Time CPU Iterations - * ----------------------------------------------------------------- - * BM_BiquadFilter/0/1/1 734 ns 732 ns 740671 - * BM_BiquadFilter/0/1/2 554 ns 553 ns 1266836 - * BM_BiquadFilter/0/1/3 647 ns 645 ns 1085314 - * BM_BiquadFilter/0/1/4 834 ns 832 ns 841345 - * BM_BiquadFilter/0/1/5 1068 ns 1065 ns 657343 - * BM_BiquadFilter/0/1/6 767 ns 765 ns 915223 - * BM_BiquadFilter/0/1/7 929 ns 926 ns 756405 - * BM_BiquadFilter/0/1/8 2173 ns 2168 ns 322949 - * BM_BiquadFilter/0/1/9 1831 ns 1826 ns 383304 - * BM_BiquadFilter/0/1/10 1931 ns 1927 ns 363386 - * BM_BiquadFilter/0/1/11 2536 ns 2529 ns 276783 - * BM_BiquadFilter/0/1/12 2535 ns 2529 ns 276826 - * BM_BiquadFilter/0/1/13 2174 ns 2169 ns 322755 - * BM_BiquadFilter/0/1/14 3257 ns 3250 ns 215383 - * BM_BiquadFilter/0/1/15 2175 ns 2169 ns 322711 - * BM_BiquadFilter/0/1/16 1494 ns 1491 ns 469625 - * BM_BiquadFilter/0/1/17 2176 ns 2171 ns 322423 - * BM_BiquadFilter/0/1/18 1568 ns 1564 ns 447467 - * BM_BiquadFilter/0/1/19 1325 ns 1322 ns 529762 - * BM_BiquadFilter/0/1/20 1926 ns 1921 ns 364534 - * BM_BiquadFilter/0/1/21 2630 ns 2623 ns 266027 - * BM_BiquadFilter/0/1/22 1998 ns 1993 ns 351210 - * BM_BiquadFilter/0/1/23 1882 ns 1877 ns 373028 - * BM_BiquadFilter/0/1/24 2536 ns 2529 ns 276818 - * BM_BiquadFilter/0/1/25 2176 ns 2170 ns 322627 - * BM_BiquadFilter/0/1/26 3258 ns 3250 ns 215440 - * BM_BiquadFilter/0/1/27 2175 ns 2169 ns 322724 - * BM_BiquadFilter/0/1/28 2535 ns 2529 ns 276823 - * BM_BiquadFilter/0/1/29 2175 ns 2170 ns 322560 - * BM_BiquadFilter/0/1/30 3259 ns 3250 ns 215354 - * BM_BiquadFilter/0/1/31 2176 ns 2171 ns 322492 - * BM_BiquadFilter/0/2/1 1470 ns 1466 ns 477411 - * BM_BiquadFilter/0/2/2 1109 ns 1107 ns 632666 - * BM_BiquadFilter/0/2/3 1297 ns 1294 ns 540804 - * BM_BiquadFilter/0/2/4 1681 ns 1677 ns 417675 - * BM_BiquadFilter/0/2/5 2137 ns 2132 ns 328721 - * BM_BiquadFilter/0/2/6 1572 ns 1569 ns 447114 - * BM_BiquadFilter/0/2/7 1736 ns 1732 ns 385563 - * BM_BiquadFilter/0/2/8 4331 ns 4320 ns 162048 - * BM_BiquadFilter/0/2/9 3795 ns 3785 ns 184166 - * BM_BiquadFilter/0/2/10 3832 ns 3823 ns 183015 - * BM_BiquadFilter/0/2/11 5060 ns 5047 ns 138660 - * BM_BiquadFilter/0/2/12 5046 ns 5034 ns 139033 - * BM_BiquadFilter/0/2/13 4333 ns 4322 ns 161952 - * BM_BiquadFilter/0/2/14 6500 ns 6482 ns 108022 - * BM_BiquadFilter/0/2/15 4335 ns 4324 ns 161915 - * BM_BiquadFilter/0/2/16 2965 ns 2957 ns 236771 - * BM_BiquadFilter/0/2/17 4368 ns 4358 ns 160829 - * BM_BiquadFilter/0/2/18 3193 ns 3186 ns 219766 - * BM_BiquadFilter/0/2/19 2804 ns 2798 ns 250201 - * BM_BiquadFilter/0/2/20 3839 ns 3830 ns 182731 - * BM_BiquadFilter/0/2/21 5310 ns 5296 ns 133012 - * BM_BiquadFilter/0/2/22 3995 ns 3984 ns 175672 - * BM_BiquadFilter/0/2/23 3755 ns 3745 ns 186960 - * BM_BiquadFilter/0/2/24 5060 ns 5045 ns 138733 - * BM_BiquadFilter/0/2/25 4343 ns 4330 ns 161632 - * BM_BiquadFilter/0/2/26 6505 ns 6489 ns 107871 - * BM_BiquadFilter/0/2/27 4348 ns 4336 ns 161436 - * BM_BiquadFilter/0/2/28 5068 ns 5054 ns 138515 - * BM_BiquadFilter/0/2/29 4401 ns 4389 ns 158834 - * BM_BiquadFilter/0/2/30 6514 ns 6497 ns 107752 - * BM_BiquadFilter/0/2/31 4352 ns 4341 ns 161242 - * BM_BiquadFilter/1/1/1 734 ns 732 ns 955891 - * BM_BiquadFilter/1/1/2 554 ns 552 ns 1267096 - * BM_BiquadFilter/1/1/3 647 ns 645 ns 1084919 - * BM_BiquadFilter/1/1/4 834 ns 832 ns 841505 - * BM_BiquadFilter/1/1/5 1068 ns 1065 ns 657299 - * BM_BiquadFilter/1/1/6 767 ns 765 ns 915192 - * BM_BiquadFilter/1/1/7 927 ns 924 ns 761606 - * BM_BiquadFilter/1/1/8 2174 ns 2168 ns 322888 - * BM_BiquadFilter/1/1/9 1832 ns 1826 ns 383311 - * BM_BiquadFilter/1/1/10 1932 ns 1926 ns 363384 - * BM_BiquadFilter/1/1/11 2536 ns 2529 ns 276796 - * BM_BiquadFilter/1/1/12 2536 ns 2529 ns 276843 - * BM_BiquadFilter/1/1/13 2175 ns 2169 ns 322743 - * BM_BiquadFilter/1/1/14 3259 ns 3250 ns 215420 - * BM_BiquadFilter/1/1/15 2175 ns 2169 ns 322745 - * BM_BiquadFilter/1/1/16 1495 ns 1491 ns 469661 - * BM_BiquadFilter/1/1/17 2177 ns 2171 ns 322388 - * BM_BiquadFilter/1/1/18 1569 ns 1564 ns 447468 - * BM_BiquadFilter/1/1/19 1325 ns 1322 ns 529736 - * BM_BiquadFilter/1/1/20 1927 ns 1922 ns 363962 - * BM_BiquadFilter/1/1/21 2624 ns 2617 ns 267102 - * BM_BiquadFilter/1/1/22 1999 ns 1993 ns 351169 - * BM_BiquadFilter/1/1/23 1882 ns 1877 ns 372944 - * BM_BiquadFilter/1/1/24 2536 ns 2529 ns 276751 - * BM_BiquadFilter/1/1/25 2176 ns 2170 ns 322560 - * BM_BiquadFilter/1/1/26 3259 ns 3250 ns 215389 - * BM_BiquadFilter/1/1/27 2175 ns 2169 ns 322640 - * BM_BiquadFilter/1/1/28 2536 ns 2529 ns 276786 - * BM_BiquadFilter/1/1/29 2176 ns 2170 ns 322533 - * BM_BiquadFilter/1/1/30 3260 ns 3251 ns 215325 - * BM_BiquadFilter/1/1/31 2177 ns 2171 ns 322425 - * BM_BiquadFilter/1/2/1 1471 ns 1467 ns 477222 - * BM_BiquadFilter/1/2/2 1109 ns 1107 ns 632565 - * BM_BiquadFilter/1/2/3 1298 ns 1294 ns 541051 - * BM_BiquadFilter/1/2/4 1682 ns 1678 ns 417354 - * BM_BiquadFilter/1/2/5 2136 ns 2129 ns 328967 - * BM_BiquadFilter/1/2/6 1572 ns 1568 ns 446095 - * BM_BiquadFilter/1/2/7 1792 ns 1788 ns 399598 - * BM_BiquadFilter/1/2/8 4333 ns 4320 ns 162032 - * BM_BiquadFilter/1/2/9 3807 ns 3797 ns 184097 - * BM_BiquadFilter/1/2/10 3842 ns 3831 ns 182711 - * BM_BiquadFilter/1/2/11 5062 ns 5048 ns 138636 - * BM_BiquadFilter/1/2/12 5050 ns 5036 ns 139031 - * BM_BiquadFilter/1/2/13 4335 ns 4323 ns 161943 - * BM_BiquadFilter/1/2/14 6500 ns 6483 ns 108020 - * BM_BiquadFilter/1/2/15 4336 ns 4324 ns 161873 - * BM_BiquadFilter/1/2/16 2966 ns 2957 ns 236709 - * BM_BiquadFilter/1/2/17 4359 ns 4348 ns 160581 - * BM_BiquadFilter/1/2/18 3196 ns 3187 ns 219532 - * BM_BiquadFilter/1/2/19 2805 ns 2798 ns 250157 - * BM_BiquadFilter/1/2/20 3839 ns 3829 ns 182628 - * BM_BiquadFilter/1/2/21 5291 ns 5276 ns 131153 - * BM_BiquadFilter/1/2/22 3994 ns 3984 ns 175699 - * BM_BiquadFilter/1/2/23 3757 ns 3747 ns 186864 - * BM_BiquadFilter/1/2/24 5060 ns 5046 ns 138754 - * BM_BiquadFilter/1/2/25 4343 ns 4331 ns 161614 - * BM_BiquadFilter/1/2/26 6512 ns 6491 ns 107852 - * BM_BiquadFilter/1/2/27 4348 ns 4336 ns 161419 - * BM_BiquadFilter/1/2/28 5068 ns 5055 ns 138481 - * BM_BiquadFilter/1/2/29 4386 ns 4374 ns 159635 - * BM_BiquadFilter/1/2/30 6514 ns 6498 ns 107752 - * BM_BiquadFilter/1/2/31 4353 ns 4342 ns 161227 + A test result running on Pixel 4XL for comparison. + + Parameterized Test BM_BiquadFilter1D/A + <A> is 0 or 1 indicating if the input data is subnormal or not. + + Parameterized Test BM_BiquadFilter<TYPE>/A/B/C + <A> is 0 or 1 indicating if the input data is subnormal or not. + <B> is the channel count, starting from 1 + <C> indicates the occupancy of the coefficients as a bitmask (1 - 31) representing + b0, b1, b2, a0, a1. 31 indicates all Biquad coefficients are non-zero. + +----------------------------------------------------------------------------------- +Benchmark Time CPU Iterations +----------------------------------------------------------------------------------- +BM_BiquadFilter1D/0 559 ns 558 ns 1254596 +BM_BiquadFilter1D/1 563 ns 561 ns 1246670 +BM_BiquadFilterFloatOptimized/0/1/31 2263 ns 2257 ns 310326 +BM_BiquadFilterFloatOptimized/0/2/31 2471 ns 2465 ns 289044 +BM_BiquadFilterFloatOptimized/0/3/31 2827 ns 2818 ns 248148 +BM_BiquadFilterFloatOptimized/0/4/31 2443 ns 2436 ns 287402 +BM_BiquadFilterFloatOptimized/0/5/31 2843 ns 2836 ns 246912 +BM_BiquadFilterFloatOptimized/0/6/31 3188 ns 3180 ns 220494 +BM_BiquadFilterFloatOptimized/0/7/31 3910 ns 3898 ns 179652 +BM_BiquadFilterFloatOptimized/0/8/31 3167 ns 3157 ns 221764 +BM_BiquadFilterFloatOptimized/0/9/31 3947 ns 3936 ns 177855 +BM_BiquadFilterFloatOptimized/0/10/31 4204 ns 4193 ns 166963 +BM_BiquadFilterFloatOptimized/0/11/31 5313 ns 5298 ns 131865 +BM_BiquadFilterFloatOptimized/0/12/31 4205 ns 4193 ns 166925 +BM_BiquadFilterFloatOptimized/0/13/31 5303 ns 5289 ns 132321 +BM_BiquadFilterFloatOptimized/0/14/31 5501 ns 5487 ns 127623 +BM_BiquadFilterFloatOptimized/0/15/31 6572 ns 6551 ns 106827 +BM_BiquadFilterFloatOptimized/0/16/31 5438 ns 5426 ns 129016 +BM_BiquadFilterFloatOptimized/0/17/31 7729 ns 7708 ns 90812 +BM_BiquadFilterFloatOptimized/0/18/31 8007 ns 7984 ns 87910 +BM_BiquadFilterFloatOptimized/0/19/31 8586 ns 8560 ns 81801 +BM_BiquadFilterFloatOptimized/0/20/31 7811 ns 7789 ns 89871 +BM_BiquadFilterFloatOptimized/0/21/31 8714 ns 8691 ns 80518 +BM_BiquadFilterFloatOptimized/0/22/31 8851 ns 8820 ns 79287 +BM_BiquadFilterFloatOptimized/0/23/31 9606 ns 9576 ns 73108 +BM_BiquadFilterFloatOptimized/0/24/31 8676 ns 8647 ns 80944 +BM_BiquadFilterFloatOptimized/0/1/1 559 ns 557 ns 1255379 +BM_BiquadFilterFloatOptimized/0/1/2 649 ns 647 ns 1081153 +BM_BiquadFilterFloatOptimized/0/1/3 649 ns 647 ns 1081862 +BM_BiquadFilterFloatOptimized/0/1/4 848 ns 846 ns 827513 +BM_BiquadFilterFloatOptimized/0/1/5 847 ns 844 ns 829861 +BM_BiquadFilterFloatOptimized/0/1/6 844 ns 841 ns 829051 +BM_BiquadFilterFloatOptimized/0/1/7 848 ns 845 ns 827492 +BM_BiquadFilterFloatOptimized/0/1/8 2289 ns 2282 ns 307077 +BM_BiquadFilterFloatOptimized/0/1/9 2296 ns 2288 ns 303532 +BM_BiquadFilterFloatOptimized/0/1/10 2305 ns 2299 ns 304972 +BM_BiquadFilterFloatOptimized/0/1/11 2287 ns 2281 ns 307199 +BM_BiquadFilterFloatOptimized/0/1/12 2262 ns 2256 ns 310300 +BM_BiquadFilterFloatOptimized/0/1/13 2263 ns 2257 ns 310400 +BM_BiquadFilterFloatOptimized/0/1/14 2262 ns 2257 ns 310236 +BM_BiquadFilterFloatOptimized/0/1/15 2262 ns 2256 ns 309767 +BM_BiquadFilterFloatOptimized/0/1/16 2262 ns 2256 ns 310116 +BM_BiquadFilterFloatOptimized/0/1/17 2262 ns 2256 ns 310345 +BM_BiquadFilterFloatOptimized/0/1/18 2261 ns 2255 ns 310106 +BM_BiquadFilterFloatOptimized/0/1/19 2263 ns 2256 ns 310242 +BM_BiquadFilterFloatOptimized/0/1/20 2262 ns 2255 ns 310131 +BM_BiquadFilterFloatOptimized/0/1/21 2262 ns 2256 ns 310250 +BM_BiquadFilterFloatOptimized/0/1/22 2264 ns 2257 ns 310329 +BM_BiquadFilterFloatOptimized/0/1/23 2264 ns 2257 ns 310011 +BM_BiquadFilterFloatOptimized/0/1/24 2264 ns 2258 ns 310008 +BM_BiquadFilterFloatOptimized/0/1/25 2263 ns 2256 ns 310203 +BM_BiquadFilterFloatOptimized/0/1/26 2264 ns 2258 ns 310016 +BM_BiquadFilterFloatOptimized/0/1/27 2264 ns 2258 ns 310046 +BM_BiquadFilterFloatOptimized/0/1/28 2263 ns 2257 ns 310325 +BM_BiquadFilterFloatOptimized/0/1/29 2264 ns 2257 ns 310048 +BM_BiquadFilterFloatOptimized/0/1/30 2265 ns 2258 ns 310008 +BM_BiquadFilterFloatOptimized/0/1/31 2263 ns 2257 ns 310046 +BM_BiquadFilterFloatOptimized/0/2/1 613 ns 611 ns 1143690 +BM_BiquadFilterFloatOptimized/0/2/2 801 ns 799 ns 877788 +BM_BiquadFilterFloatOptimized/0/2/3 799 ns 797 ns 875867 +BM_BiquadFilterFloatOptimized/0/2/4 1507 ns 1502 ns 465824 +BM_BiquadFilterFloatOptimized/0/2/5 1505 ns 1501 ns 466685 +BM_BiquadFilterFloatOptimized/0/2/6 1500 ns 1496 ns 468398 +BM_BiquadFilterFloatOptimized/0/2/7 1505 ns 1501 ns 466086 +BM_BiquadFilterFloatOptimized/0/2/8 2237 ns 2231 ns 313969 +BM_BiquadFilterFloatOptimized/0/2/9 2233 ns 2227 ns 314388 +BM_BiquadFilterFloatOptimized/0/2/10 2234 ns 2228 ns 314181 +BM_BiquadFilterFloatOptimized/0/2/11 2236 ns 2230 ns 313480 +BM_BiquadFilterFloatOptimized/0/2/12 2474 ns 2467 ns 287311 +BM_BiquadFilterFloatOptimized/0/2/13 2448 ns 2441 ns 284113 +BM_BiquadFilterFloatOptimized/0/2/14 2512 ns 2504 ns 283932 +BM_BiquadFilterFloatOptimized/0/2/15 2448 ns 2442 ns 285294 +BM_BiquadFilterFloatOptimized/0/2/16 2459 ns 2451 ns 283455 +BM_BiquadFilterFloatOptimized/0/2/17 2466 ns 2459 ns 282706 +BM_BiquadFilterFloatOptimized/0/2/18 2493 ns 2486 ns 279019 +BM_BiquadFilterFloatOptimized/0/2/19 2453 ns 2446 ns 285647 +BM_BiquadFilterFloatOptimized/0/2/20 2475 ns 2468 ns 285345 +BM_BiquadFilterFloatOptimized/0/2/21 2506 ns 2499 ns 286030 +BM_BiquadFilterFloatOptimized/0/2/22 2512 ns 2505 ns 280230 +BM_BiquadFilterFloatOptimized/0/2/23 2466 ns 2458 ns 284371 +BM_BiquadFilterFloatOptimized/0/2/24 2454 ns 2446 ns 286968 +BM_BiquadFilterFloatOptimized/0/2/25 2478 ns 2470 ns 284297 +BM_BiquadFilterFloatOptimized/0/2/26 2509 ns 2501 ns 281390 +BM_BiquadFilterFloatOptimized/0/2/27 2477 ns 2469 ns 282065 +BM_BiquadFilterFloatOptimized/0/2/28 2484 ns 2476 ns 284814 +BM_BiquadFilterFloatOptimized/0/2/29 2443 ns 2435 ns 283155 +BM_BiquadFilterFloatOptimized/0/2/30 2488 ns 2481 ns 283433 +BM_BiquadFilterFloatOptimized/0/2/31 2452 ns 2444 ns 286627 +BM_BiquadFilterFloatOptimized/0/3/1 1015 ns 1012 ns 691448 +BM_BiquadFilterFloatOptimized/0/3/2 1362 ns 1357 ns 516073 +BM_BiquadFilterFloatOptimized/0/3/3 1358 ns 1354 ns 516419 +BM_BiquadFilterFloatOptimized/0/3/4 2305 ns 2298 ns 304561 +BM_BiquadFilterFloatOptimized/0/3/5 2303 ns 2296 ns 304712 +BM_BiquadFilterFloatOptimized/0/3/6 2292 ns 2285 ns 306149 +BM_BiquadFilterFloatOptimized/0/3/7 2304 ns 2297 ns 304959 +BM_BiquadFilterFloatOptimized/0/3/8 2318 ns 2311 ns 301399 +BM_BiquadFilterFloatOptimized/0/3/9 2310 ns 2303 ns 303435 +BM_BiquadFilterFloatOptimized/0/3/10 2313 ns 2306 ns 303399 +BM_BiquadFilterFloatOptimized/0/3/11 2318 ns 2311 ns 302755 +BM_BiquadFilterFloatOptimized/0/3/12 2831 ns 2823 ns 247948 +BM_BiquadFilterFloatOptimized/0/3/13 2830 ns 2822 ns 248677 +BM_BiquadFilterFloatOptimized/0/3/14 2850 ns 2842 ns 246677 +BM_BiquadFilterFloatOptimized/0/3/15 2827 ns 2819 ns 248340 +BM_BiquadFilterFloatOptimized/0/3/16 2834 ns 2825 ns 247686 +BM_BiquadFilterFloatOptimized/0/3/17 2828 ns 2819 ns 247784 +BM_BiquadFilterFloatOptimized/0/3/18 2842 ns 2833 ns 248007 +BM_BiquadFilterFloatOptimized/0/3/19 2843 ns 2834 ns 248472 +BM_BiquadFilterFloatOptimized/0/3/20 2829 ns 2820 ns 248677 +BM_BiquadFilterFloatOptimized/0/3/21 2833 ns 2824 ns 248061 +BM_BiquadFilterFloatOptimized/0/3/22 2849 ns 2841 ns 246611 +BM_BiquadFilterFloatOptimized/0/3/23 2827 ns 2818 ns 247863 +BM_BiquadFilterFloatOptimized/0/3/24 2830 ns 2820 ns 247550 +BM_BiquadFilterFloatOptimized/0/3/25 2836 ns 2827 ns 248435 +BM_BiquadFilterFloatOptimized/0/3/26 2859 ns 2850 ns 245727 +BM_BiquadFilterFloatOptimized/0/3/27 2828 ns 2818 ns 247914 +BM_BiquadFilterFloatOptimized/0/3/28 2818 ns 2809 ns 248637 +BM_BiquadFilterFloatOptimized/0/3/29 2820 ns 2810 ns 249170 +BM_BiquadFilterFloatOptimized/0/3/30 2858 ns 2849 ns 245760 +BM_BiquadFilterFloatOptimized/0/3/31 2832 ns 2823 ns 247969 +BM_BiquadFilterFloatOptimized/0/4/1 650 ns 648 ns 1080613 +BM_BiquadFilterFloatOptimized/0/4/2 808 ns 805 ns 868597 +BM_BiquadFilterFloatOptimized/0/4/3 809 ns 807 ns 867460 +BM_BiquadFilterFloatOptimized/0/4/4 844 ns 841 ns 831865 +BM_BiquadFilterFloatOptimized/0/4/5 845 ns 842 ns 833152 +BM_BiquadFilterFloatOptimized/0/4/6 844 ns 842 ns 831276 +BM_BiquadFilterFloatOptimized/0/4/7 843 ns 841 ns 831231 +BM_BiquadFilterFloatOptimized/0/4/8 2193 ns 2186 ns 320234 +BM_BiquadFilterFloatOptimized/0/4/9 2194 ns 2187 ns 320273 +BM_BiquadFilterFloatOptimized/0/4/10 2193 ns 2185 ns 320448 +BM_BiquadFilterFloatOptimized/0/4/11 2192 ns 2185 ns 320000 +BM_BiquadFilterFloatOptimized/0/4/12 2448 ns 2440 ns 285447 +BM_BiquadFilterFloatOptimized/0/4/13 2454 ns 2447 ns 282923 +BM_BiquadFilterFloatOptimized/0/4/14 2359 ns 2352 ns 297360 +BM_BiquadFilterFloatOptimized/0/4/15 2467 ns 2459 ns 286699 +BM_BiquadFilterFloatOptimized/0/4/16 2442 ns 2435 ns 286231 +BM_BiquadFilterFloatOptimized/0/4/17 2446 ns 2438 ns 287105 +BM_BiquadFilterFloatOptimized/0/4/18 2361 ns 2354 ns 298082 +BM_BiquadFilterFloatOptimized/0/4/19 2452 ns 2444 ns 284776 +BM_BiquadFilterFloatOptimized/0/4/20 2459 ns 2451 ns 284338 +BM_BiquadFilterFloatOptimized/0/4/21 2437 ns 2429 ns 289188 +BM_BiquadFilterFloatOptimized/0/4/22 2358 ns 2350 ns 297415 +BM_BiquadFilterFloatOptimized/0/4/23 2459 ns 2451 ns 287279 +BM_BiquadFilterFloatOptimized/0/4/24 2452 ns 2444 ns 281625 +BM_BiquadFilterFloatOptimized/0/4/25 2389 ns 2382 ns 290665 +BM_BiquadFilterFloatOptimized/0/4/26 2360 ns 2354 ns 298354 +BM_BiquadFilterFloatOptimized/0/4/27 2433 ns 2425 ns 288930 +BM_BiquadFilterFloatOptimized/0/4/28 2447 ns 2440 ns 285827 +BM_BiquadFilterFloatOptimized/0/4/29 2450 ns 2444 ns 282857 +BM_BiquadFilterFloatOptimized/0/4/30 2360 ns 2353 ns 297327 +BM_BiquadFilterFloatOptimized/0/4/31 2463 ns 2455 ns 281644 +BM_BiquadFilterFloatOptimized/1/1/1 559 ns 558 ns 1255286 +BM_BiquadFilterFloatOptimized/1/1/2 649 ns 647 ns 1081171 +BM_BiquadFilterFloatOptimized/1/1/3 649 ns 647 ns 1081604 +BM_BiquadFilterFloatOptimized/1/1/4 848 ns 845 ns 828382 +BM_BiquadFilterFloatOptimized/1/1/5 847 ns 844 ns 827653 +BM_BiquadFilterFloatOptimized/1/1/6 848 ns 845 ns 831001 +BM_BiquadFilterFloatOptimized/1/1/7 848 ns 845 ns 828299 +BM_BiquadFilterFloatOptimized/1/1/8 2291 ns 2284 ns 306513 +BM_BiquadFilterFloatOptimized/1/1/9 2297 ns 2290 ns 305585 +BM_BiquadFilterFloatOptimized/1/1/10 2310 ns 2303 ns 304022 +BM_BiquadFilterFloatOptimized/1/1/11 2286 ns 2279 ns 307592 +BM_BiquadFilterFloatOptimized/1/1/12 2263 ns 2256 ns 310010 +BM_BiquadFilterFloatOptimized/1/1/13 2264 ns 2258 ns 310167 +BM_BiquadFilterFloatOptimized/1/1/14 2264 ns 2257 ns 310392 +BM_BiquadFilterFloatOptimized/1/1/15 2264 ns 2257 ns 310298 +BM_BiquadFilterFloatOptimized/1/1/16 2263 ns 2256 ns 310058 +BM_BiquadFilterFloatOptimized/1/1/17 2264 ns 2257 ns 310262 +BM_BiquadFilterFloatOptimized/1/1/18 2263 ns 2256 ns 310337 +BM_BiquadFilterFloatOptimized/1/1/19 2266 ns 2258 ns 310162 +BM_BiquadFilterFloatOptimized/1/1/20 2264 ns 2257 ns 309968 +BM_BiquadFilterFloatOptimized/1/1/21 2264 ns 2256 ns 310341 +BM_BiquadFilterFloatOptimized/1/1/22 2266 ns 2258 ns 310193 +BM_BiquadFilterFloatOptimized/1/1/23 2263 ns 2257 ns 310310 +BM_BiquadFilterFloatOptimized/1/1/24 2265 ns 2257 ns 310182 +BM_BiquadFilterFloatOptimized/1/1/25 2263 ns 2256 ns 309999 +BM_BiquadFilterFloatOptimized/1/1/26 2265 ns 2258 ns 309943 +BM_BiquadFilterFloatOptimized/1/1/27 2264 ns 2257 ns 310175 +BM_BiquadFilterFloatOptimized/1/1/28 2265 ns 2258 ns 310119 +BM_BiquadFilterFloatOptimized/1/1/29 2265 ns 2258 ns 310231 +BM_BiquadFilterFloatOptimized/1/1/30 2266 ns 2258 ns 309976 +BM_BiquadFilterFloatOptimized/1/1/31 2266 ns 2258 ns 310062 +BM_BiquadFilterFloatOptimized/1/2/1 613 ns 611 ns 1144258 +BM_BiquadFilterFloatOptimized/1/2/2 801 ns 798 ns 877198 +BM_BiquadFilterFloatOptimized/1/2/3 800 ns 798 ns 877731 +BM_BiquadFilterFloatOptimized/1/2/4 1508 ns 1503 ns 465854 +BM_BiquadFilterFloatOptimized/1/2/5 1505 ns 1500 ns 466533 +BM_BiquadFilterFloatOptimized/1/2/6 1502 ns 1497 ns 468915 +BM_BiquadFilterFloatOptimized/1/2/7 1505 ns 1501 ns 466371 +BM_BiquadFilterFloatOptimized/1/2/8 2238 ns 2231 ns 313808 +BM_BiquadFilterFloatOptimized/1/2/9 2236 ns 2228 ns 313914 +BM_BiquadFilterFloatOptimized/1/2/10 2235 ns 2228 ns 314155 +BM_BiquadFilterFloatOptimized/1/2/11 2237 ns 2231 ns 313280 +BM_BiquadFilterFloatOptimized/1/2/12 2482 ns 2474 ns 287057 +BM_BiquadFilterFloatOptimized/1/2/13 2491 ns 2483 ns 280960 +BM_BiquadFilterFloatOptimized/1/2/14 2520 ns 2513 ns 283093 +BM_BiquadFilterFloatOptimized/1/2/15 2452 ns 2445 ns 285828 +BM_BiquadFilterFloatOptimized/1/2/16 2503 ns 2495 ns 286339 +BM_BiquadFilterFloatOptimized/1/2/17 2499 ns 2491 ns 285757 +BM_BiquadFilterFloatOptimized/1/2/18 2509 ns 2502 ns 281775 +BM_BiquadFilterFloatOptimized/1/2/19 2491 ns 2483 ns 281628 +BM_BiquadFilterFloatOptimized/1/2/20 2466 ns 2458 ns 289553 +BM_BiquadFilterFloatOptimized/1/2/21 2456 ns 2447 ns 284222 +BM_BiquadFilterFloatOptimized/1/2/22 2502 ns 2494 ns 277208 +BM_BiquadFilterFloatOptimized/1/2/23 2458 ns 2450 ns 281951 +BM_BiquadFilterFloatOptimized/1/2/24 2474 ns 2466 ns 290638 +BM_BiquadFilterFloatOptimized/1/2/25 2473 ns 2465 ns 285659 +BM_BiquadFilterFloatOptimized/1/2/26 2504 ns 2497 ns 278294 +BM_BiquadFilterFloatOptimized/1/2/27 2467 ns 2459 ns 279088 +BM_BiquadFilterFloatOptimized/1/2/28 2487 ns 2479 ns 281668 +BM_BiquadFilterFloatOptimized/1/2/29 2469 ns 2462 ns 274237 +BM_BiquadFilterFloatOptimized/1/2/30 2502 ns 2495 ns 282294 +BM_BiquadFilterFloatOptimized/1/2/31 2438 ns 2430 ns 285546 +BM_BiquadFilterFloatOptimized/1/3/1 1016 ns 1012 ns 691507 +BM_BiquadFilterFloatOptimized/1/3/2 1361 ns 1357 ns 514162 +BM_BiquadFilterFloatOptimized/1/3/3 1359 ns 1355 ns 516211 +BM_BiquadFilterFloatOptimized/1/3/4 2304 ns 2297 ns 304380 +BM_BiquadFilterFloatOptimized/1/3/5 2305 ns 2297 ns 304978 +BM_BiquadFilterFloatOptimized/1/3/6 2291 ns 2283 ns 306726 +BM_BiquadFilterFloatOptimized/1/3/7 2305 ns 2297 ns 304572 +BM_BiquadFilterFloatOptimized/1/3/8 2332 ns 2325 ns 303627 +BM_BiquadFilterFloatOptimized/1/3/9 2313 ns 2305 ns 303341 +BM_BiquadFilterFloatOptimized/1/3/10 2317 ns 2310 ns 303791 +BM_BiquadFilterFloatOptimized/1/3/11 2325 ns 2317 ns 302256 +BM_BiquadFilterFloatOptimized/1/3/12 2828 ns 2819 ns 248120 +BM_BiquadFilterFloatOptimized/1/3/13 2831 ns 2821 ns 248747 +BM_BiquadFilterFloatOptimized/1/3/14 2849 ns 2840 ns 247287 +BM_BiquadFilterFloatOptimized/1/3/15 2829 ns 2821 ns 247794 +BM_BiquadFilterFloatOptimized/1/3/16 2830 ns 2821 ns 248154 +BM_BiquadFilterFloatOptimized/1/3/17 2833 ns 2823 ns 249270 +BM_BiquadFilterFloatOptimized/1/3/18 2853 ns 2843 ns 247248 +BM_BiquadFilterFloatOptimized/1/3/19 2839 ns 2830 ns 248141 +BM_BiquadFilterFloatOptimized/1/3/20 2830 ns 2820 ns 248693 +BM_BiquadFilterFloatOptimized/1/3/21 2832 ns 2823 ns 247991 +BM_BiquadFilterFloatOptimized/1/3/22 2850 ns 2841 ns 246602 +BM_BiquadFilterFloatOptimized/1/3/23 2828 ns 2819 ns 248859 +BM_BiquadFilterFloatOptimized/1/3/24 2835 ns 2826 ns 247837 +BM_BiquadFilterFloatOptimized/1/3/25 2837 ns 2828 ns 246757 +BM_BiquadFilterFloatOptimized/1/3/26 2860 ns 2851 ns 245739 +BM_BiquadFilterFloatOptimized/1/3/27 2830 ns 2821 ns 248257 +BM_BiquadFilterFloatOptimized/1/3/28 2826 ns 2817 ns 249220 +BM_BiquadFilterFloatOptimized/1/3/29 2820 ns 2811 ns 248787 +BM_BiquadFilterFloatOptimized/1/3/30 2853 ns 2844 ns 245666 +BM_BiquadFilterFloatOptimized/1/3/31 2829 ns 2821 ns 248423 +BM_BiquadFilterFloatOptimized/1/4/1 650 ns 648 ns 1080678 +BM_BiquadFilterFloatOptimized/1/4/2 808 ns 806 ns 868623 +BM_BiquadFilterFloatOptimized/1/4/3 810 ns 807 ns 867092 +BM_BiquadFilterFloatOptimized/1/4/4 844 ns 842 ns 831679 +BM_BiquadFilterFloatOptimized/1/4/5 843 ns 841 ns 830964 +BM_BiquadFilterFloatOptimized/1/4/6 845 ns 842 ns 831367 +BM_BiquadFilterFloatOptimized/1/4/7 845 ns 842 ns 830992 +BM_BiquadFilterFloatOptimized/1/4/8 2193 ns 2186 ns 320186 +BM_BiquadFilterFloatOptimized/1/4/9 2194 ns 2187 ns 320621 +BM_BiquadFilterFloatOptimized/1/4/10 2194 ns 2187 ns 320577 +BM_BiquadFilterFloatOptimized/1/4/11 2192 ns 2184 ns 320221 +BM_BiquadFilterFloatOptimized/1/4/12 2471 ns 2462 ns 286817 +BM_BiquadFilterFloatOptimized/1/4/13 2450 ns 2442 ns 283590 +BM_BiquadFilterFloatOptimized/1/4/14 2359 ns 2351 ns 297716 +BM_BiquadFilterFloatOptimized/1/4/15 2472 ns 2464 ns 286187 +BM_BiquadFilterFloatOptimized/1/4/16 2450 ns 2442 ns 291296 +BM_BiquadFilterFloatOptimized/1/4/17 2460 ns 2453 ns 287031 +BM_BiquadFilterFloatOptimized/1/4/18 2358 ns 2350 ns 297376 +BM_BiquadFilterFloatOptimized/1/4/19 2469 ns 2462 ns 286443 +BM_BiquadFilterFloatOptimized/1/4/20 2470 ns 2462 ns 286582 +BM_BiquadFilterFloatOptimized/1/4/21 2420 ns 2412 ns 288366 +BM_BiquadFilterFloatOptimized/1/4/22 2359 ns 2351 ns 297356 +BM_BiquadFilterFloatOptimized/1/4/23 2448 ns 2440 ns 286534 +BM_BiquadFilterFloatOptimized/1/4/24 2458 ns 2450 ns 286994 +BM_BiquadFilterFloatOptimized/1/4/25 2420 ns 2412 ns 287406 +BM_BiquadFilterFloatOptimized/1/4/26 2363 ns 2354 ns 298029 +BM_BiquadFilterFloatOptimized/1/4/27 2421 ns 2414 ns 288812 +BM_BiquadFilterFloatOptimized/1/4/28 2449 ns 2442 ns 286253 +BM_BiquadFilterFloatOptimized/1/4/29 2448 ns 2441 ns 287234 +BM_BiquadFilterFloatOptimized/1/4/30 2361 ns 2354 ns 297402 +BM_BiquadFilterFloatOptimized/1/4/31 2465 ns 2457 ns 287662 +BM_BiquadFilterFloatNonOptimized/0/1/31 2261 ns 2257 ns 310189 +BM_BiquadFilterFloatNonOptimized/0/2/31 4525 ns 4510 ns 155178 +BM_BiquadFilterFloatNonOptimized/0/3/31 6781 ns 6760 ns 103524 +BM_BiquadFilterFloatNonOptimized/0/4/31 9037 ns 9008 ns 77684 +BM_BiquadFilterFloatNonOptimized/0/5/31 11298 ns 11259 ns 62169 +BM_BiquadFilterFloatNonOptimized/0/6/31 13575 ns 13534 ns 51708 +BM_BiquadFilterFloatNonOptimized/0/7/31 15841 ns 15796 ns 44308 +BM_BiquadFilterFloatNonOptimized/0/8/31 18105 ns 18047 ns 38797 +BM_BiquadFilterFloatNonOptimized/0/9/31 20335 ns 20270 ns 34532 +BM_BiquadFilterFloatNonOptimized/0/10/31 22603 ns 22534 ns 31063 +BM_BiquadFilterFloatNonOptimized/0/11/31 24875 ns 24798 ns 28225 +BM_BiquadFilterFloatNonOptimized/0/12/31 27122 ns 27038 ns 25889 +BM_BiquadFilterFloatNonOptimized/0/13/31 29416 ns 29326 ns 23869 +BM_BiquadFilterFloatNonOptimized/0/14/31 31703 ns 31602 ns 22151 +BM_BiquadFilterFloatNonOptimized/0/15/31 33945 ns 33849 ns 20678 +BM_BiquadFilterFloatNonOptimized/0/16/31 36232 ns 36123 ns 19379 +BM_BiquadFilterFloatNonOptimized/0/17/31 38577 ns 38464 ns 18205 +BM_BiquadFilterFloatNonOptimized/0/18/31 41054 ns 40924 ns 17106 +BM_BiquadFilterFloatNonOptimized/0/19/31 43278 ns 43133 ns 16228 +BM_BiquadFilterFloatNonOptimized/0/20/31 45549 ns 45414 ns 15411 +BM_BiquadFilterFloatNonOptimized/0/21/31 48024 ns 47867 ns 14625 +BM_BiquadFilterFloatNonOptimized/0/22/31 50268 ns 50109 ns 13965 +BM_BiquadFilterFloatNonOptimized/0/23/31 52268 ns 52090 ns 13440 +BM_BiquadFilterFloatNonOptimized/0/24/31 54528 ns 54350 ns 12883 +BM_BiquadFilterDoubleOptimized/0/1/31 2264 ns 2258 ns 309965 +BM_BiquadFilterDoubleOptimized/0/2/31 2503 ns 2495 ns 270274 +BM_BiquadFilterDoubleOptimized/0/3/31 2787 ns 2778 ns 251685 +BM_BiquadFilterDoubleOptimized/0/4/31 3175 ns 3165 ns 221339 +BM_BiquadFilterDoubleNonOptimized/0/1/31 2264 ns 2257 ns 310154 +BM_BiquadFilterDoubleNonOptimized/0/2/31 4523 ns 4508 ns 155292 +BM_BiquadFilterDoubleNonOptimized/0/3/31 6778 ns 6756 ns 103599 +BM_BiquadFilterDoubleNonOptimized/0/4/31 9063 ns 9033 ns 77461 + *******************************************************************/ template <typename F> @@ -299,10 +490,11 @@ static void BiquadFilterDoubleArgs(benchmark::internal::Benchmark* b) { } } -BENCHMARK(BM_BiquadFilterDoubleOptimized)->Apply(BiquadFilterDoubleArgs); -BENCHMARK(BM_BiquadFilterDoubleNonOptimized)->Apply(BiquadFilterDoubleArgs); BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterQuickArgs); -BENCHMARK(BM_BiquadFilterFloatNonOptimized)->Apply(BiquadFilterQuickArgs); BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterFullArgs); +// Other tests of interest +BENCHMARK(BM_BiquadFilterFloatNonOptimized)->Apply(BiquadFilterQuickArgs); +BENCHMARK(BM_BiquadFilterDoubleOptimized)->Apply(BiquadFilterDoubleArgs); +BENCHMARK(BM_BiquadFilterDoubleNonOptimized)->Apply(BiquadFilterDoubleArgs); BENCHMARK_MAIN(); diff --git a/audio_utils/include/audio_utils/BiquadFilter.h b/audio_utils/include/audio_utils/BiquadFilter.h index c2f481b..b3683e3 100644 --- a/audio_utils/include/audio_utils/BiquadFilter.h +++ b/audio_utils/include/audio_utils/BiquadFilter.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H -#define ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H +#pragma once #include "intrinsic_utils.h" @@ -36,12 +35,332 @@ #define USE_NEON #endif +// Use dither to prevent subnormals for CPUs that raise an exception. +#pragma push_macro("USE_DITHER") +#undef USE_DITHER + +#if defined(__i386__) || defined(__x86_x64__) +#define USE_DITHER +#endif + namespace android::audio_utils { static constexpr size_t kBiquadNumCoefs = 5; static constexpr size_t kBiquadNumDelays = 2; +/** + * The BiquadDirect2Transpose is a low overhead + * Biquad filter with coefficients b0, b1, b2, a1, a2. + * + * This can be used by itself, but it is preferred for best data management + * to use the BiquadFilter abstraction below. + * + * T is the data type (scalar or vector). + * F is the filter coefficient type. It is either a scalar or vector (matching T). + */ +template <typename T, typename F> +struct BiquadDirect2Transpose { + F coef_[5]; // these are stored with the denominator a's negated. + T s1_; // delay state 1 + T s2_; // delay state 2 + + // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2) + // as expressed by a bitmask. + static inline constexpr size_t required_occupancies_[] = { + 0x1, // constant scale + 0x3, // single zero + 0x7, // double zero + 0x9, // single pole + 0xb, // (11) first order IIR + 0x1b, // (27) double pole + single zero + 0x1f, // (31) second order IIR (full Biquad) + }; + + // Take care the order of arguments - starts with b's then goes to a's. + // The a's are "positive" reference, some filters take negative. + BiquadDirect2Transpose(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2, + const T& s1 = {}, const T& s2 = {}) + // : coef_{b0, b1, b2, -a1, -a2} + : coef_{ b0, + b1, + b2, + intrinsics::vneg(a1), + intrinsics::vneg(a2) } + , s1_{s1} + , s2_{s2} { + } + + // D is the data type. It must be the same element type of T or F. + // Take care the order of input and output. + template<typename D, size_t OCCUPANCY = 0x1f> + __attribute__((always_inline)) // required for 1ch speedup (30% faster) + void process(D* output, const D* input, size_t frames, size_t stride) { + using namespace intrinsics; + // For SSE it is possible to vdup F to T if F is scalar. + const F b0 = coef_[0]; // b0 + const F b1 = coef_[1]; // b1 + const F b2 = coef_[2]; // b2 + const F negativeA1 = coef_[3]; // -a1 + const F negativeA2 = coef_[4]; // -a2 + T s1 = s1_; + T s2 = s2_; + T xn, yn; // OK to declare temps outside loop rather than at the point of initialization. +#ifdef USE_DITHER + constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT + T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration. +#endif + + // Unroll control. Make sure the constexpr remains constexpr :-). + constexpr size_t CHANNELS = sizeof(T) / sizeof(D); + constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 2; // below this won't be unrolled. + constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16; // above this won't be unrolled. + constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT && + CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1; + size_t remainder = 0; + if constexpr (UNROLL_LOOPS > 1) { + remainder = frames % UNROLL_LOOPS; + frames /= UNROLL_LOOPS; + } + + // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4. + // The other alternative is to use a MACRO, but that doesn't read as well. + const auto KERNEL = [&]() __attribute__((always_inline)) { + xn = vld1<T>(input); + input += stride; +#ifdef USE_DITHER + xn = vadd(xn, dither); + dither = vneg(dither); +#endif + + yn = s1; + if constexpr (OCCUPANCY >> 0 & 1) { + yn = vmla(yn, b0, xn); + } + vst1(output, yn); + output += stride; + + s1 = s2; + if constexpr (OCCUPANCY >> 3 & 1) { + s1 = vmla(s1, negativeA1, yn); + } + if constexpr (OCCUPANCY >> 1 & 1) { + s1 = vmla(s1, b1, xn); + } + if constexpr (OCCUPANCY >> 2 & 1) { + s2 = vmul(b2, xn); + } else { + s2 = vdupn<T>(0.f); + } + if constexpr (OCCUPANCY >> 4 & 1) { + s2 = vmla(s2, negativeA2, yn); + } + }; + + while (frames > 0) { + #pragma unroll + for (size_t i = 0; i < UNROLL_LOOPS; ++i) { + KERNEL(); + } + frames--; + } + if constexpr (UNROLL_LOOPS > 1) { + for (size_t i = 0; i < remainder; ++i) { + KERNEL(); + } + } + s1_ = s1; + s2_ = s2; + } +}; + +/** + * A state space formulation of filtering converts a n-th order difference equation update + * to a first order vector difference equation. For the Biquad filter, the state space form + * has improved numerical precision properties with poles near the unit circle as well as + * increased speed due to better parallelization of the state update [1][2]. + * + * [1] Raph Levien: (observerable canonical form) + * https://github.com/google/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb + * + * [2] Julius O Smith III: (controllable canonical form) + * https://ccrma.stanford.edu/~jos/filters/State_Space_Filters.html + * + * The signal flow is as follows, where for scalar x and y, the matrix D is a scalar d. + * + * + * +------[ d ]--------------------------+ + * | S | + * x ----+--[ B ]--(+)--[ z^-1 ]---+---[ C ]--(+)--- y + * | | + * +----[ A ]-----+ + * + * The 2nd order Biquad IIR coefficients are as follows in observerable canonical form: + * + * y = [C_1 C_2] | S_1 | + d x + * | S_2 | + * + * + * | S_1 | = | A_11 A_12 | | S_1 | + | B_1 | x + * | S_2 | | A_21 A_22 | | S_2 | | B_2 | + * + * + * A_11 = -a1 + * A_12 = 1 + * A_21 = -a2 + * A_22 = 0 + * + * B_1 = b1 - b0 * a1 + * B_2 = b2 - b0 * a2 + * + * C_1 = 1 + * C_2 = 0 + * + * d = b0 + * + * Implementation details: The state space filter is typically expressed in either observable or + * controllable canonical form [3]. We follow the observable form here. + * Raph [4] discovered that the single channel Biquad implementation can be further optimized + * by doing 2 filter updates at once (improving speed on NEON by about 20%). + * Doing 2 updates at once costs 8 multiplies / sample instead of 5 multiples / sample, + * but uses a 4 x 4 matrix multiply, exploiting single cycle multiply-add SIMD throughput. + * TODO: consider this optimization. + * + * [3] Mathworks + * https://www.mathworks.com/help/control/ug/canonical-state-space-realizations.html + * [4] Raph Levien + * https://github.com/kysko/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb + * + * The template variables + * T is the data type (scalar or vector). + * F is the filter coefficient type. It is either a scalar or vector (matching T). + */ +template <typename T, typename F> +struct BiquadStateSpace { + F coef_[5]; // these are stored as state-space converted. + T s1_; // delay state 1 + T s2_; // delay state 2 + + // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2) + // as expressed by a bitmask. This must include 31. + static inline constexpr size_t required_occupancies_[] = { + 1, // constant scale + 3, // single zero + 7, // double zero + 9, // single pole + 11, // first order IIR + 27, // double pole + single zero + 31, // second order IIR (full Biquad) + }; + + // Take care the order of arguments - starts with b's then goes to a's. + // The a's are "positive" reference, some filters take negative. + BiquadStateSpace(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2, + const T& s1 = {}, const T& s2 = {}) + // : coef_{b0, b1 - b0 * a1, b2 - b0 * a2, -a1, -a2} + : coef_{ b0, + intrinsics::vsub(b1, intrinsics::vmul(b0, a1)), + intrinsics::vsub(b2, intrinsics::vmul(b0, a2)), + intrinsics::vneg(a1), + intrinsics::vneg(a2) } + , s1_{s1} + , s2_{s2} { + } + + // D is the data type. It must be the same element type of T or F. + // Take care the order of input and output. + template<typename D, size_t OCCUPANCY = 0x1f> + void process(D* output, const D* input, size_t frames, size_t stride) { + using namespace intrinsics; + const F b0 = coef_[0]; // b0 + const F b1ss = coef_[1]; // b1 - b0 * a1, + const F b2ss = coef_[2]; // b2 - b0 * a2, + const F negativeA1 = coef_[3]; // -a1 + const F negativeA2 = coef_[4]; // -a2 + T s1 = s1_; + T s2 = s2_; + T x, new_s1; // OK to declare temps here rather than at the point of initialization. +#ifdef USE_DITHER + constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT + T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration. +#endif + constexpr bool b0_present = (OCCUPANCY & 0x1) != 0; + constexpr bool a1_present = (OCCUPANCY & 0x8) != 0; + constexpr bool a2_present = (OCCUPANCY & 0x10) != 0; + constexpr bool b1ss_present = (OCCUPANCY & 0x2) != 0 || + (b0_present && a1_present); + constexpr bool b2ss_present = (OCCUPANCY & 0x4) != 0 || + (b0_present && a2_present); + + + // Unroll control. Make sure the constexpr remains constexpr :-). + constexpr size_t CHANNELS = sizeof(T) / sizeof(D); + constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 1; // below this won't be unrolled. + constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16; // above this won't be unrolled. + constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT && + CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1; + size_t remainder = 0; + if constexpr (UNROLL_LOOPS > 1) { + remainder = frames % UNROLL_LOOPS; + frames /= UNROLL_LOOPS; + } + + // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4. + // The other alternative is to use a MACRO, but that doesn't read as well. + const auto KERNEL = [&]() __attribute__((always_inline)) { + x = vld1<T>(input); + input += stride; +#ifdef USE_DITHER + x = vadd(x, dither); + dither = vneg(dither); +#endif + // vst1(output, vadd(s1, vmul(b0, x))); + // output += stride; + // new_s1 = vadd(vadd(vmul(b1ss, x), vmul(negativeA1, s1)), s2); + // s2 = vadd(vmul(b2ss, x), vmul(negativeA2, s1)); + + if constexpr (b0_present) { + vst1(output, vadd(s1, vmul(b0, x))); + } else /* constexpr */ { + vst1(output, s1); + } + output += stride; + new_s1 = s2; + if constexpr (b1ss_present) { + new_s1 = vadd(new_s1, vmul(b1ss, x)); + } + if constexpr (a1_present) { + new_s1 = vadd(new_s1, vmul(negativeA1, s1)); + } + if constexpr (b2ss_present) { + s2 = vmul(b2ss, x); + if constexpr (a2_present) { + s2 = vadd(s2, vmul(negativeA2, s1)); + } + } else if constexpr (a2_present) { + s2 = vmul(negativeA2, s1); + } + s1 = new_s1; + }; + + while (frames > 0) { + #pragma unroll + for (size_t i = 0; i < UNROLL_LOOPS; ++i) { + KERNEL(); + } + frames--; + } + if constexpr (UNROLL_LOOPS > 1) { + for (size_t i = 0; i < remainder; ++i) { + KERNEL(); + } + } + s1_ = s1; + s2_ = s2; + } +}; + namespace details { + // Helper methods for constructing a constexpr array of function pointers. // As function pointers are efficient and have no constructor/destructor // this is preferred over std::function. @@ -105,67 +424,6 @@ static inline void setCoefficients( } } -// For biquad_filter_fast, we template based on whether coef[i] is nonzero - this should be -// determined in a constexpr fashion for optimization. - -// Helper which takes a stride to allow column processing of interleaved audio streams. -template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D> -void biquad_filter_1fast(D *out, const D *in, size_t frames, size_t stride, - size_t channelCount, D *delays, const D *coefs, size_t localStride) { -#if defined(__i386__) || defined(__x86_x64__) - D delta = std::numeric_limits<float>::min() * (1 << 24); -#endif - D b0, b1, b2, negativeA1, negativeA2; - - if constexpr (SAME_COEF_PER_CHANNEL) { - b0 = coefs[0]; - b1 = coefs[1]; - b2 = coefs[2]; - negativeA1 = -coefs[3]; - negativeA2 = -coefs[4]; - } - for (size_t i = 0; i < channelCount; ++i) { - if constexpr (!SAME_COEF_PER_CHANNEL) { - b0 = coefs[0]; - b1 = coefs[localStride]; - b2 = coefs[2 * localStride]; - negativeA1 = -coefs[3 * localStride]; - negativeA2 = -coefs[4 * localStride]; - ++coefs; - } - - D s1n1 = delays[0]; - D s2n1 = delays[localStride]; - const D *input = &in[i]; - D *output = &out[i]; - for (size_t j = frames; j > 0; --j) { - // Adding a delta to avoid subnormal exception handling on the x86/x64 platform; - // this is not a problem with the ARM platform. The delta will not affect the - // precision of the result. -#if defined(__i386__) || defined(__x86_x64__) - const D xn = *input + delta; -#else - const D xn = *input; -#endif - D yn = (OCCUPANCY >> 0 & 1) * b0 * xn + s1n1; - s1n1 = (OCCUPANCY >> 1 & 1) * b1 * xn + (OCCUPANCY >> 3 & 1) * negativeA1 * yn + s2n1; - s2n1 = (OCCUPANCY >> 2 & 1) * b2 * xn + (OCCUPANCY >> 4 & 1) * negativeA2 * yn; - - input += stride; - - *output = yn; - output += stride; - -#if defined(__i386__) || defined(__x86_x64__) - delta = -delta; -#endif - } - delays[0] = s1n1; - delays[localStride] = s2n1; - ++delays; - } -} - // Helper function to zero channels in the input buffer. // This is used for the degenerate coefficient case which results in all zeroes. template <typename D> @@ -180,90 +438,69 @@ void zeroChannels(D *out, size_t frames, size_t stride, size_t channelCount) { } } -template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D> -void biquad_filter_fast(D *out, const D *in, size_t frames, size_t stride, - size_t channelCount, D *delays, const D *coefs, size_t localStride) { - if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero. - zeroChannels(out, frames, stride, channelCount); - return; - } - biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>( - out, in, frames, stride, channelCount, delays, coefs, localStride); -} - -#ifdef USE_NEON - -template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F> -void biquad_filter_neon_impl(F *out, const F *in, size_t frames, size_t stride, +template <template <typename, typename> typename FilterType, + size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F> +void biquad_filter_func_impl(F *out, const F *in, size_t frames, size_t stride, size_t channelCount, F *delays, const F *coefs, size_t localStride) { using namespace android::audio_utils::intrinsics; constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T. - T b0, b1, b2, negativeA1, negativeA2; - if constexpr (SAME_COEF_PER_CHANNEL) { - b0 = vdupn<T>(coefs[0]); - b1 = vdupn<T>(coefs[1]); - b2 = vdupn<T>(coefs[2]); - negativeA1 = vneg(vdupn<T>(coefs[3])); - negativeA2 = vneg(vdupn<T>(coefs[4])); - } + const size_t coefStride = SAME_COEF_PER_CHANNEL ? 1 : localStride; + using CoefType = std::conditional_t<SAME_COEF_PER_CHANNEL, F, T>; + for (size_t i = 0; i < channelCount; i += elements) { - if constexpr (!SAME_COEF_PER_CHANNEL) { - b0 = vld1<T>(coefs); - b1 = vld1<T>(coefs + localStride); - b2 = vld1<T>(coefs + localStride * 2); - negativeA1 = vneg(vld1<T>(coefs + localStride * 3)); - negativeA2 = vneg(vld1<T>(coefs + localStride * 4)); - coefs += elements; - } T s1 = vld1<T>(&delays[0]); T s2 = vld1<T>(&delays[localStride]); - const F *input = &in[i]; - F *output = &out[i]; - for (size_t j = frames; j > 0; --j) { - T xn = vld1<T>(input); - T yn = s1; - if constexpr (OCCUPANCY >> 0 & 1) { - yn = vmla(yn, b0, xn); - } - s1 = s2; - if constexpr (OCCUPANCY >> 3 & 1) { - s1 = vmla(s1, negativeA1, yn); - } - if constexpr (OCCUPANCY >> 1 & 1) { - s1 = vmla(s1, b1, xn); - } - if constexpr (OCCUPANCY >> 2 & 1) { - s2 = vmul(b2, xn); - } else { - s2 = vdupn<T>(0.f); - } - if constexpr (OCCUPANCY >> 4 & 1) { - s2 = vmla(s2, negativeA2, yn); - } + FilterType<T, CoefType> kernel( + vld1<CoefType>(coefs), vld1<CoefType>(coefs + coefStride), + vld1<CoefType>(coefs + coefStride * 2), vld1<CoefType>(coefs + coefStride * 3), + vld1<CoefType>(coefs + coefStride * 4), + s1, s2); + if constexpr (!SAME_COEF_PER_CHANNEL) coefs += elements; + kernel.template process<F, OCCUPANCY>(&out[i], &in[i], frames, stride); + vst1(&delays[0], kernel.s1_); + vst1(&delays[localStride], kernel.s2_); + delays += elements; + } +} - input += stride; - vst1(output, yn); - output += stride; +// Find the nearest occupancy mask that includes all the desired bits. +template <typename T, size_t N> +static constexpr size_t nearestOccupancy(T occupancy, const T (&occupancies)[N]) { + if (occupancy < 32) { + for (auto test : occupancies) { + if ((occupancy & test) == occupancy) return test; } - vst1(&delays[0], s1); - vst1(&delays[localStride], s2); - delays += elements; } + return 31; } -#define BIQUAD_FILTER_CASE(N, ... /* type */) \ +enum FILTER_OPTION { + FILTER_OPTION_SCALAR_ONLY = (1 << 0), +}; + +// Default biquad type. +template <typename T, typename F> +using BiquadFilterType = BiquadStateSpace<T, F>; + +#define BIQUAD_FILTER_CASE(N, FilterType, ... /* type */) \ case N: { \ - biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, __VA_ARGS__>( \ + using VectorType = __VA_ARGS__; \ + biquad_filter_func_impl< \ + FilterType, \ + nearestOccupancy(OCCUPANCY, \ + FilterType<VectorType, D>::required_occupancies_), \ + SAME_COEF_PER_CHANNEL, VectorType>( \ out + offset, in + offset, frames, stride, remaining, \ delays + offset, c, localStride); \ goto exit; \ } template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D> -void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride, - size_t channelCount, D *delays, const D *coefs, size_t localStride) { +void biquad_filter_func(D *out, const D *in, size_t frames, size_t stride, + size_t channelCount, D *delays, const D *coefs, size_t localStride, + FILTER_OPTION filterOptions) { if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero. zeroChannels(out, frames, stride, channelCount); return; @@ -274,41 +511,53 @@ void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride, // using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; }; // using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; }; +#ifdef USE_NEON + // use NEON types to ensure we have the proper intrinsic acceleration. + using alt_16_t = float32x4x4_t; + using alt_8_t = float32x4x2_t; + using alt_4_t = float32x4_t; +#else + // Use C++ types, no NEON needed. + using alt_16_t = intrinsics::internal_array_t<float, 16>; + using alt_8_t = intrinsics::internal_array_t<float, 8>; + using alt_4_t = intrinsics::internal_array_t<float, 4>; +#endif + for (size_t offset = 0; offset < channelCount; ) { size_t remaining = channelCount - offset; auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset; + if (filterOptions & FILTER_OPTION_SCALAR_ONLY) goto scalar; if constexpr (std::is_same_v<D, float>) { switch (remaining) { default: if (remaining >= 16) { remaining &= ~15; - biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, float32x4x4_t>( + biquad_filter_func_impl< + BiquadFilterType, + nearestOccupancy(OCCUPANCY, + BiquadFilterType<D, D>::required_occupancies_), + SAME_COEF_PER_CHANNEL, alt_16_t>( out + offset, in + offset, frames, stride, remaining, delays + offset, c, localStride); offset += remaining; continue; } break; // case 1 handled at bottom. - BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>) - BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>) - BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>) - BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>) - BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>) - BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>) - BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>) - // We choose the NEON intrinsic type over internal_array for 8 to - // check if there is any performance difference in benchmark (should be similar). - // BIQUAD_FILTER_CASE(8, intrinsics::internal_array_t<float, 8>) - BIQUAD_FILTER_CASE(8, float32x4x2_t) - BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>) - BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>) - BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>) - BIQUAD_FILTER_CASE(4, float32x4_t) - // We choose the NEON intrinsic type over internal_array for 4 to - // check if there is any performance difference in benchmark (should be similar). - // BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<float, 4>) - BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>) - BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>) + BIQUAD_FILTER_CASE(15, BiquadFilterType, intrinsics::internal_array_t<float, 15>) + BIQUAD_FILTER_CASE(14, BiquadFilterType, intrinsics::internal_array_t<float, 14>) + BIQUAD_FILTER_CASE(13, BiquadFilterType, intrinsics::internal_array_t<float, 13>) + BIQUAD_FILTER_CASE(12, BiquadFilterType, intrinsics::internal_array_t<float, 12>) + BIQUAD_FILTER_CASE(11, BiquadFilterType, intrinsics::internal_array_t<float, 11>) + BIQUAD_FILTER_CASE(10, BiquadFilterType, intrinsics::internal_array_t<float, 10>) + BIQUAD_FILTER_CASE(9, BiquadFilterType, intrinsics::internal_array_t<float, 9>) + BIQUAD_FILTER_CASE(8, BiquadFilterType, alt_8_t) + BIQUAD_FILTER_CASE(7, BiquadFilterType, intrinsics::internal_array_t<float, 7>) + BIQUAD_FILTER_CASE(6, BiquadFilterType, intrinsics::internal_array_t<float, 6>) + BIQUAD_FILTER_CASE(5, BiquadFilterType, intrinsics::internal_array_t<float, 5>) + BIQUAD_FILTER_CASE(4, BiquadFilterType, alt_4_t) + BIQUAD_FILTER_CASE(3, BiquadFilterType, intrinsics::internal_array_t<float, 3>) + BIQUAD_FILTER_CASE(2, BiquadFilterType, intrinsics::internal_array_t<float, 2>) + // BIQUAD_FILTER_CASE(1, BiquadFilterType, intrinsics::internal_array_t<float, 1>) } } else if constexpr (std::is_same_v<D, double>) { #if defined(__aarch64__) @@ -316,27 +565,34 @@ void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride, default: if (remaining >= 8) { remaining &= ~7; - biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, - intrinsics::internal_array_t<double, 8>>( + biquad_filter_func_impl<BiquadFilterType, + nearestOccupancy(OCCUPANCY, + BiquadFilterType<D, D>::required_occupancies_), + SAME_COEF_PER_CHANNEL, + intrinsics::internal_array_t<double, 8>>( out + offset, in + offset, frames, stride, remaining, delays + offset, c, localStride); offset += remaining; continue; } break; // case 1 handled at bottom. - BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>) - BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>) - BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>) - BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>) - BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>) - BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>) + BIQUAD_FILTER_CASE(7, BiquadFilterType, intrinsics::internal_array_t<double, 7>) + BIQUAD_FILTER_CASE(6, BiquadFilterType, intrinsics::internal_array_t<double, 6>) + BIQUAD_FILTER_CASE(5, BiquadFilterType, intrinsics::internal_array_t<double, 5>) + BIQUAD_FILTER_CASE(4, BiquadFilterType, intrinsics::internal_array_t<double, 4>) + BIQUAD_FILTER_CASE(3, BiquadFilterType, intrinsics::internal_array_t<double, 3>) + BIQUAD_FILTER_CASE(2, BiquadFilterType, intrinsics::internal_array_t<double, 2>) }; #endif } + scalar: // Essentially the code below is scalar, the same as // biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>, // but formulated with NEON intrinsic-like call pattern. - biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, D>( + biquad_filter_func_impl<BiquadFilterType, + nearestOccupancy(OCCUPANCY, + BiquadFilterType<D, D>::required_occupancies_), + SAME_COEF_PER_CHANNEL, D>( out + offset, in + offset, frames, stride, remaining, delays + offset, c, localStride); offset += remaining; @@ -344,8 +600,6 @@ void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride, exit:; } -#endif // USE_NEON - } // namespace details /** @@ -584,16 +838,14 @@ public: } // Select the proper filtering function from our array. - (void)optimized; // avoid unused variable warning. - mFunc = mFilterFast[category]; // default if we don't have processor optimization. - -#ifdef USE_NEON - /* if constexpr (std::is_same_v<D, float>) */ { - if (optimized) { - mFunc = mFilterNeon[category]; - } + if (optimized) { + mFilterOptions = (details::FILTER_OPTION) + (mFilterOptions & ~details::FILTER_OPTION_SCALAR_ONLY); + } else { + mFilterOptions = (details::FILTER_OPTION) + (mFilterOptions | details::FILTER_OPTION_SCALAR_ONLY); } -#endif + mFunc = mFilterFuncs[category]; } /** @@ -603,7 +855,7 @@ public: * \param in pointer to the input data * \param frames number of audio frames to be processed */ - void process(D* out, const D *in, size_t frames) { + void process(D* out, const D* in, size_t frames) { process(out, in, frames, mChannelCount); } @@ -615,10 +867,10 @@ public: * \param frames number of audio frames to be processed * \param stride the total number of samples associated with a frame, if not channelCount. */ - void process(D* out, const D *in, size_t frames, size_t stride) { + void process(D* out, const D* in, size_t frames, size_t stride) { assert(stride >= mChannelCount); mFunc(out, in, frames, stride, mChannelCount, mDelays.data(), - mCoefs.data(), mChannelCount); + mCoefs.data(), mChannelCount, mFilterOptions); } /** @@ -655,7 +907,7 @@ public: auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd); auto delays = mDelays.data() + fromEnd; mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1, - delays, coefs, mChannelCount); + delays, coefs, mChannelCount, mFilterOptions); } auto delays = mDelays.data() + baseIdx; @@ -664,13 +916,13 @@ public: // sliding one audio sample at a time. mFunc(inout, inout, frames - channelBlock + 1, 1 /* stride */, channelBlock, - delays, coefs, mChannelCount); + delays, coefs, mChannelCount, mFilterOptions); // drain data pipe. for (size_t i = 1; i < channelBlock; ++i) { mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i, 1 /* frames */, 1 /* stride */, channelBlock - i, - delays, coefs, mChannelCount); + delays, coefs, mChannelCount, mFilterOptions); } } } @@ -681,7 +933,7 @@ public: auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd); mFunc(inout, inout, frames, 1 /* stride */, 1 /* channelCount */, - mDelays.data() + fromEnd, coefs, mChannelCount); + mDelays.data() + fromEnd, coefs, mChannelCount, mFilterOptions); } } @@ -746,121 +998,57 @@ private: */ std::vector<D> mDelays; - using filter_func = decltype(details::biquad_filter_fast<0, true, D>); - - /** - * \var filter_func* mFunc - * - * The current filter function selected for the channel occupancy of the Biquad. - */ - filter_func *mFunc; - - // Create a functional wrapper to feed "biquad_filter_fast" to - // make_functional_array() to populate the array. - // - // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients - // b0 b1 b2 a1 a2 (from lsb to msb) - template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL - struct FuncWrap { - template<typename T> - static constexpr size_t nearest() { - // Combine cases to both improve expected performance and reduce code space. - // Some occupancy masks provide worse performance than more occupied masks. - constexpr size_t required_occupancies[] = { - 1, // constant scale - 3, // single zero - 7, // double zero - 9, // single pole - // 11, // first order IIR (unnecessary optimization, close enough to 31). - 27, // double pole + single zero - 31, // second order IIR (full Biquad) - }; - if constexpr (OCCUPANCY < 32) { - for (auto test : required_occupancies) { - if ((OCCUPANCY & test) == OCCUPANCY) return test; - } - } else { - static_assert(intrinsics::dependent_false_v<T>); - } - return 0; // never gets here. - } - - static void func(D* out, const D *in, size_t frames, size_t stride, - size_t channelCount, D *delays, const D *coef, size_t localStride) { - constexpr size_t NEAREST_OCCUPANCY = nearest<D>(); - details::biquad_filter_fast<NEAREST_OCCUPANCY, SC>( - out, in, frames, stride, channelCount, delays, coef, localStride); - } - }; + details::FILTER_OPTION mFilterOptions{}; - /** - * \var mFilterFast + // Consider making a separate delegation class. + /* + * We store an array of functions based on the occupancy. * - * std::array of functions based on coefficient occupancy. + * OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients + * b0 b1 b2 a1 a2 (from lsb to msb) * * static inline constexpr std::array<filter_func*, M> mArray = { - * biquad_filter_fast<0>, - * biquad_filter_fast<1>, - * biquad_filter_fast<2>, + * biquad_filter_func<0>, + * biquad_filter_func<1>, + * biquad_filter_func<2>, * ... - * biquad_filter_fast<(1 << kBiquadNumCoefs) - 1>, + * biquad_filter_func<(1 << kBiquadNumCoefs) - 1>, * }; * * Every time the coefficients are changed, we select the processing function from * this table. */ - static inline constexpr auto mFilterFast = - details::make_functional_array< - FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>(); - -#ifdef USE_NEON - // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients - // b0 b1 b2 a1 a2 (from lsb to msb) + // Used to build the functional array. template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL - struct FuncWrapNeon { - template<typename T> - static constexpr size_t nearest() { - // combine cases to both improve expected performance and reduce code space. - // - // This lists the occupancies we will specialize functions for. - constexpr size_t required_occupancies[] = { - 1, // constant scale - 3, // single zero - 7, // double zero - 9, // single pole - 11, // first order IIR - 27, // double pole + single zero - 31, // second order IIR (full Biquad) - }; - if constexpr (OCCUPANCY < 32) { - for (auto test : required_occupancies) { - if ((OCCUPANCY & test) == OCCUPANCY) return test; - } - } else { - static_assert(intrinsics::dependent_false_v<T>); - } - return 0; // never gets here. - } - + struct FuncWrap { static void func(D* out, const D *in, size_t frames, size_t stride, - size_t channelCount, D *delays, const D *coef, size_t localStride) { - constexpr size_t NEAREST_OCCUPANCY = nearest<D>(); - details::biquad_filter_neon<NEAREST_OCCUPANCY, SC>( - out, in, frames, stride, channelCount, delays, coef, localStride); + size_t channelCount, D *delays, const D *coef, size_t localStride, + details::FILTER_OPTION filterOptions) { + constexpr size_t NEAREST_OCCUPANCY = + details::nearestOccupancy( + OCCUPANCY, details::BiquadFilterType<D, D>::required_occupancies_); + details::biquad_filter_func<NEAREST_OCCUPANCY, SC>( + out, in, frames, stride, channelCount, delays, coef, localStride, + filterOptions); } }; - // Neon optimized array of functions. - static inline constexpr auto mFilterNeon = + // Vector optimized array of functions. + static inline constexpr auto mFilterFuncs = details::make_functional_array< - FuncWrapNeon, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>(); -#endif // USE_NEON + FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>(); + /** + * \var filter_func* mFunc + * + * The current filter function selected for the channel occupancy of the Biquad. + * It will be one of mFilterFuncs. + */ + std::decay_t<decltype(mFilterFuncs[0])> mFunc; }; } // namespace android::audio_utils +#pragma pop_macro("USE_DITHER") #pragma pop_macro("USE_NEON") - -#endif // !ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H diff --git a/audio_utils/include/audio_utils/intrinsic_utils.h b/audio_utils/include/audio_utils/intrinsic_utils.h index ed2b2bb..0c333e0 100644 --- a/audio_utils/include/audio_utils/intrinsic_utils.h +++ b/audio_utils/include/audio_utils/intrinsic_utils.h @@ -78,6 +78,45 @@ struct internal_array_t { using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; }; */ +// add a + b +template<typename T> +static inline T vadd(T a, T b) { + if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) { + return a + b; + +#ifdef USE_NEON + } else if constexpr (std::is_same_v<T, float32x2_t>) { + return vadd_f32(a, b); + } else if constexpr (std::is_same_v<T, float32x4_t>) { + return vaddq_f32(a, b); +#if defined(__aarch64__) + } else if constexpr (std::is_same_v<T, float64x2_t>) { + return vaddq_f64(a, b); +#endif +#endif // USE_NEON + + } else /* constexpr */ { + T ret; + auto &[retval] = ret; // single-member struct + const auto &[aval] = a; + const auto &[bval] = b; + if constexpr (std::is_array_v<decltype(retval)>) { +#pragma unroll + for (size_t i = 0; i < std::size(aval); ++i) { + retval[i] = vadd(aval[i], bval[i]); + } + return ret; + } else /* constexpr */ { + auto &[r1, r2] = retval; + const auto &[a1, a2] = aval; + const auto &[b1, b2] = bval; + r1 = vadd(a1, b1); + r2 = vadd(a2, b2); + return ret; + } + } +} + // duplicate float into all elements. template<typename T, typename F> static inline T vdupn(F f) { @@ -156,6 +195,73 @@ static inline T vld1(const F *f) { } } +/** + * Returns c as follows: + * c_i = a_i * b_i if a and b are the same vector type or + * c_i = a_i * b if a is a vector and b is scalar or + * c_i = a * b_i if a is scalar and b is a vector. + */ +template<typename T, typename S, typename F> +static inline T vmla(T a, S b, F c) { + // Both types T and S are non-primitive and they are not equal. T == S handled below. + (void) a; + (void) b; + (void) c; + static_assert(dependent_false_v<T>); +} + +template<typename T, typename F> +static inline T vmla(T a, T b, F c) { + if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) { + if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) { + return a + b * c; + } else { + static_assert(dependent_false_v<T>); + } + } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) { + // handle the lane variant +#ifdef USE_NEON + if constexpr (std::is_same_v<T, float32x2_t>) { + return vmla_n_f32(a, b, c); + } else if constexpr (std::is_same_v<T, float32x4_t>) { + return vmlaq_n_f32(a, b,c); +#if defined(__aarch64__) + } else if constexpr (std::is_same_v<T, float64x2_t>) { + return vmlaq_n_f64(a, b); +#endif + } else +#endif // USE_NEON + { + T ret; + auto &[retval] = ret; // single-member struct + const auto &[aval] = a; + const auto &[bval] = b; + if constexpr (std::is_array_v<decltype(retval)>) { +#pragma unroll + for (size_t i = 0; i < std::size(aval); ++i) { + retval[i] = vmla(aval[i], bval[i], c); + } + return ret; + } else /* constexpr */ { + auto &[r1, r2] = retval; + const auto &[a1, a2] = aval; + const auto &[b1, b2] = bval; + r1 = vmla(a1, b1, c); + r2 = vmla(a2, b2, c); + return ret; + } + } + } else { + // Both types T and F are non-primitive and they are not equal. + static_assert(dependent_false_v<T>); + } +} + +template<typename T, typename F> +static inline T vmla(T a, F b, T c) { + return vmla(a, c, b); +} + // fused multiply-add a + b * c template<typename T> static inline T vmla(T a, T b, T c) { @@ -197,7 +303,57 @@ static inline T vmla(T a, T b, T c) { } } -// multiply a * b +/** + * Returns c as follows: + * c_i = a_i * b_i if a and b are the same vector type or + * c_i = a_i * b if a is a vector and b is scalar or + * c_i = a * b_i if a is scalar and b is a vector. + */ +template<typename T, typename F> +static inline auto vmul(T a, F b) { + if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) { + if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) { + return a * b; + } else /* constexpr */ { + return vmul(b, a); // we prefer T to be the vector/struct form. + } + } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) { + // handle the lane variant +#ifdef USE_NEON + if constexpr (std::is_same_v<T, float32x2_t>) { + return vmul_n_f32(a, b); + } else if constexpr (std::is_same_v<T, float32x4_t>) { + return vmulq_n_f32(a, b); +#if defined(__aarch64__) + } else if constexpr (std::is_same_v<T, float64x2_t>) { + return vmulq_n_f64(a, b); +#endif + } else +#endif // USE_NEON + { + T ret; + auto &[retval] = ret; // single-member struct + const auto &[aval] = a; + if constexpr (std::is_array_v<decltype(retval)>) { +#pragma unroll + for (size_t i = 0; i < std::size(aval); ++i) { + retval[i] = vmul(aval[i], b); + } + return ret; + } else /* constexpr */ { + auto &[r1, r2] = retval; + const auto &[a1, a2] = aval; + r1 = vmul(a1, b); + r2 = vmul(a2, b); + return ret; + } + } + } else { + // Both types T and F are non-primitive and they are not equal. + static_assert(dependent_false_v<T>); + } +} + template<typename T> static inline T vmul(T a, T b) { if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) { @@ -308,6 +464,45 @@ static inline void vst1(F *f, T a) { } } +// subtract a - b +template<typename T> +static inline T vsub(T a, T b) { + if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) { + return a - b; + +#ifdef USE_NEON + } else if constexpr (std::is_same_v<T, float32x2_t>) { + return vsub_f32(a, b); + } else if constexpr (std::is_same_v<T, float32x4_t>) { + return vsubq_f32(a, b); +#if defined(__aarch64__) + } else if constexpr (std::is_same_v<T, float64x2_t>) { + return vsubq_f64(a, b); +#endif +#endif // USE_NEON + + } else /* constexpr */ { + T ret; + auto &[retval] = ret; // single-member struct + const auto &[aval] = a; + const auto &[bval] = b; + if constexpr (std::is_array_v<decltype(retval)>) { +#pragma unroll + for (size_t i = 0; i < std::size(aval); ++i) { + retval[i] = vsub(aval[i], bval[i]); + } + return ret; + } else /* constexpr */ { + auto &[r1, r2] = retval; + const auto &[a1, a2] = aval; + const auto &[b1, b2] = bval; + r1 = vsub(a1, b1); + r2 = vsub(a2, b2); + return ret; + } + } +} + } // namespace android::audio_utils::intrinsics #pragma pop_macro("USE_NEON") diff --git a/audio_utils/tests/biquad_filter_tests.cpp b/audio_utils/tests/biquad_filter_tests.cpp index 3be2a5d..f90b863 100644 --- a/audio_utils/tests/biquad_filter_tests.cpp +++ b/audio_utils/tests/biquad_filter_tests.cpp @@ -29,12 +29,13 @@ using namespace android::audio_utils; /************************************************************************************ * Reference data, must not change. - * The reference output data is from running in matlab y = filter(b, a, x), where - * b = [2.0f, 3.0f] - * a = [1.0f, 0.2f] - * x = [-0.1f, -0.2f, -0.3f, -0.4f, -0.5f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f] - * The output y = [-0.2f, -0.66f, -1.068f, -1.4864f, -1.9027f, - * -0.9195f, 0.8839f, 1.0232f, 1.4954f, 1.9009f]. + * The reference output data is from running in matlab or octave y = filter(b, a, x), where + * b = [2.0 3.0 4.0] + * a = [1.0 0.2 0.3] + * x = [-0.1 -0.2 -0.3 -0.4 -0.5 0.1 0.2 0.3 0.4 0.5] + * filter(b, a, x) + * + * The output y = [-0.2 -0.66 -1.4080 -2.0204 -2.5735 -1.7792 -0.1721 2.1682 2.1180 2.3259]. * The reference data construct the input and output as 2D array so that it can be * use to practice calling BiquadFilter::process multiple times. ************************************************************************************/ @@ -43,11 +44,12 @@ constexpr size_t PERIOD = 2; constexpr float INPUT[PERIOD][FRAME_COUNT] = { {-0.1f, -0.2f, -0.3f, -0.4f, -0.5f}, {0.1f, 0.2f, 0.3f, 0.4f, 0.5f}}; +// COEFS in order of [ b0 b1 b2 a1 a2 ], normalized form where a0 = 1. constexpr std::array<float, kBiquadNumCoefs> COEFS = { - 2.0f, 3.0f, 0.0f, 0.2f, 0.0f }; + 2.0f, 3.0f, 4.0f, 0.2f, 0.3f }; constexpr float OUTPUT[PERIOD][FRAME_COUNT] = { - {-0.2f, -0.66f, -1.068f, -1.4864f, -1.9027f}, - {-0.9195f, 0.8839f, 1.0232f, 1.4954f, 1.9009f}}; + {-0.2f, -0.66f, -1.4080f, -2.0204f, -2.5735f}, + {-1.7792f, -0.1721f, 2.1682f, 2.1180f, 2.3259f}}; constexpr float EPS = 1e-4f; template <typename S, typename D> diff --git a/audio_utils/tests/intrinsic_tests.cpp b/audio_utils/tests/intrinsic_tests.cpp index 6a16747..d9686ef 100644 --- a/audio_utils/tests/intrinsic_tests.cpp +++ b/audio_utils/tests/intrinsic_tests.cpp @@ -25,6 +25,13 @@ class IntrisicUtilsTest : public ::testing::Test { }; using FloatTypes = ::testing::Types<float, double>; TYPED_TEST_CASE(IntrisicUtilsTest, FloatTypes); +TYPED_TEST(IntrisicUtilsTest, vadd) { + constexpr TypeParam a = 0.25f; + constexpr TypeParam b = 0.5f; + constexpr TypeParam result = a + b; + ASSERT_EQ(result, android::audio_utils::intrinsics::vadd(a, b)); +} + TYPED_TEST(IntrisicUtilsTest, vdupn) { constexpr TypeParam value = 1.f; ASSERT_EQ(value, android::audio_utils::intrinsics::vdupn<TypeParam>(value)); @@ -62,3 +69,10 @@ TYPED_TEST(IntrisicUtilsTest, vst1) { &destination, android::audio_utils::intrinsics::vdupn<TypeParam>(value)); ASSERT_EQ(value, destination); } + +TYPED_TEST(IntrisicUtilsTest, vsub) { + constexpr TypeParam a = 1.25f; + constexpr TypeParam b = 1.5f; + constexpr TypeParam result = a - b; + ASSERT_EQ(result, android::audio_utils::intrinsics::vsub(a, b)); +} |