diff --git a/CMakeLists.txt b/CMakeLists.txt index 5118377..46458c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,15 +1,20 @@ -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.1) project(uavs3d) +option(COMPILE_10BIT "Enable 10bit streams decoding support." OFF) + +set(CMAKE_C_STANDARD 99) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + aux_source_directory(./test DIR_SRC_TEST) -set_source_files_properties(${DIR_SRC_TEST} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -std=c99 -O3") add_subdirectory(./source) add_executable(uavs3dec ${DIR_SRC_TEST}) -target_link_libraries(uavs3dec m) +if (NOT MSVC) + target_link_libraries(uavs3dec m) +endif() target_link_libraries(uavs3dec uavs3d) #target_link_libraries(uavs3dec dl) - diff --git a/COPYING b/COPYING index 409d303..ce30f0f 100644 --- a/COPYING +++ b/COPYING @@ -1,4 +1,4 @@ -Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] +Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] All rights reserved. @@ -9,10 +9,7 @@ modification, are permitted provided that the following conditions are met: 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. All advertising materials mentioning features or use of this software - must display the following acknowledgement: - This product includes the software uAVS3d developed by Peking University Shenzhen Graduate School, Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation. -4. Neither the name of the organizations (Peking University Shenzhen Graduate School, Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) +3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/README.md b/README.md index e46ada8..9ce6a2a 100644 --- a/README.md +++ b/README.md @@ -5,15 +5,16 @@ 2) supports to compile for Android/IOS/Windows/Linux/MacOS systems. 3) optimized for ARMv7/ARMv8/SSE4/AVX2 chips. 4) 10bit decoding on all supported platforms. - + 5) The uavs3 codec supports x86 and ARM platforms, and has been tested and verified on the Kunpeng processor. + 6) On ARM platforms, the Kunpeng processor is recommended. # license Copyright reserved by “Peking University Shenzhen Graduate School”, “Peng Cheng Laboratory”, and “Guangdong Bohua UHD Innovation Corporation”

- This program is a free software. You can redistribute it and/or modify it under the terms of the BSD 4-clause license.
+ This program is free software. You can redistribute it and/or modify it under the terms of the BSD 3-clause license.
For more details, please view the file "COPYING" in the project. # compile The default configuration only support 8bit decoding.
- To support 10bit streams decoding, edit source/decore/com_def.h : #define BIT_DEPTH 10 + To support 10bit streams decoding: cmake -DCOMPILE_10BIT=1 ## windows Prerequisites: @@ -22,17 +23,20 @@ Prerequisites: build: 1. ./version.bat (to generate version.h) 2. solution file: build/x86_windows/uavs3d.sln + + To support 10bit streams decoding, edit source/decore/com_def.h : #define COMPILE_10BIT 1 ## linux/mac Prerequisites: 1. gawk (http://www.gnu.org/software/gawk/) - 2. CMake (https://cmake.org) version 2.8 or higher + 2. CMake (https://cmake.org) version 3.1 or higher Build: 1. mkdir build/linux - 2. cd build/linux && cmake ../.. + 2. cd build/linux && cmake -DCOMPILE_10BIT=0 ../.. 3. make && make install + To support 10bit streams decoding: cmake -DCOMPILE_10BIT=1 to build shared library, set BUILD_SHARED_LIBS=1 please. ## ios @@ -40,8 +44,11 @@ Prerequisites: XCode Build: -1. ./version.sh (generate the version.h) -2. xcode solution file: build/ios/uavs3d.xcodeproj + 1. ./version.sh (generate the version.h) + 2. xcode solution file: build/ios/uavs3d.xcodeproj + + To support 10bit streams decoding: + Find Xcode -> PROJECT -> Build Settings -> Preprocessor Macros, add COMPILE_10BIT=1 ## android Prerequisites: @@ -51,8 +58,10 @@ Build ndk library or executable file: 1. ./version.sh (generate the version.h) 2. cd build/android/ndk/jni 3. $NDK_PATH/ndk-build + + To support 10bit streams decoding: edit build/android/ndk/jni/uavs3d_main.mk: -The executable application for arm64-v8a is generated by default.
To generate static or shared library for other platforms, modify correlative options in Android.mk and Application.mk. + LOCAL_CFLAGS += -DCOMPILE_10BIT=1 # Run tests ## window/linux/mac/android diff --git a/build/android/ndk/jni/Android.mk b/build/android/ndk/jni/Android.mk index fe79947..88d57f4 100644 --- a/build/android/ndk/jni/Android.mk +++ b/build/android/ndk/jni/Android.mk @@ -5,53 +5,26 @@ SRC_PATH := ../../../../source INCLUDE_PATH := ../../../../source/decore ### Name of the local module -include $(CLEAR_VARS) -LOCAL_MODULE := uavs3d +include $(LOCAL_PATH)/uavs3d_clear_vars.mk +LOCAL_MODULE := uavs3d-static +LOCAL_MODULE_FILENAME := libuavs3d +include $(LOCAL_PATH)/uavs3d_main.mk +include $(BUILD_STATIC_LIBRARY) + +include $(LOCAL_PATH)/uavs3d_clear_vars.mk +LOCAL_MODULE := uavs3d-shared +LOCAL_MODULE_FILENAME := libuavs3d LOCAL_LDLIBS:=-L$(SYSROOT)/usr/lib -lm -llog - -### for posix pthread -#LOCAL_SHARED_LIBRARIES := libcutil - -### include search path when compiling all sources (C,C++,Assembly) -LOCAL_C_INCLUDES +=$(INCLUDE_PATH) \ - $(LOCAL_PATH)/../app - -### c source code -uavs3d_srcs_c += $(SRC_PATH)/decore/alf.c -uavs3d_srcs_c += $(SRC_PATH)/decore/deblock.c -uavs3d_srcs_c += $(SRC_PATH)/decore/inter_pred.c -uavs3d_srcs_c += $(SRC_PATH)/decore/intra_pred.c -uavs3d_srcs_c += $(SRC_PATH)/decore/inv_trans.c -uavs3d_srcs_c += $(SRC_PATH)/decore/pic_manager.c -uavs3d_srcs_c += $(SRC_PATH)/decore/recon.c -uavs3d_srcs_c += $(SRC_PATH)/decore/sao.c -uavs3d_srcs_c += $(SRC_PATH)/decore/com_table.c -uavs3d_srcs_c += $(SRC_PATH)/decore/threadpool.c -uavs3d_srcs_c += $(SRC_PATH)/decore/win32thread.c -uavs3d_srcs_c += $(SRC_PATH)/decore/com_util.c -uavs3d_srcs_c += $(SRC_PATH)/decoder/uavs3d.c -uavs3d_srcs_c += $(SRC_PATH)/decoder/bitstream.c -uavs3d_srcs_c += $(SRC_PATH)/decoder/parser.c -uavs3d_srcs_c += $(SRC_PATH)/decoder/dec_util.c - -LOCAL_CFLAGS += -O3 -fPIC -std=gnu99 LOCAL_LDFLAGS += -fPIC +include $(LOCAL_PATH)/uavs3d_main.mk +include 
$(BUILD_SHARED_LIBRARY) + -#if build_executable +include $(LOCAL_PATH)/uavs3d_clear_vars.mk +LOCAL_MODULE := uavs3d +LOCAL_LDLIBS:=-L$(SYSROOT)/usr/lib -lm -llog LOCAL_CFLAGS += -pie -fPIE LOCAL_LDFLAGS += -pie -fPIE -uavs3d_srcs_test+= $(SRC_PATH)/../test/utest.c -#endif - -#if build armv7a -#LOCAL_CFLAGS += -mfpu=neon -#include $(LOCAL_PATH)/uavs3d_armv7a.mk -#elif build arm64 -include $(LOCAL_PATH)/uavs3d_arm64.mk -#endif - -LOCAL_SRC_FILES := $(uavs3d_srcs_c) $(uavs3d_srcs_arm) $(uavs3d_srcs_test) - -#include $(BUILD_SHARED_LIBRARY) -#include $(BUILD_STATIC_LIBRARY) +uavs3d_srcs_test+= $(SRC_PATH)/../test/utest.c +include $(LOCAL_PATH)/uavs3d_main.mk include $(BUILD_EXECUTABLE) diff --git a/build/android/ndk/jni/Application.mk b/build/android/ndk/jni/Application.mk index 292946e..29e7e02 100644 --- a/build/android/ndk/jni/Application.mk +++ b/build/android/ndk/jni/Application.mk @@ -1,8 +1,9 @@ # APP_ABI := armeabi-v7a - APP_ABI := arm64-v8a +# APP_ABI := arm64-v8a # APP_ABI := armeabi # APP_ABI := x86 # APP_ABI := x86_64 +APP_ABI := all APP_OPTIM := release # TARGET_BUILD_TYPE=release diff --git a/build/android/ndk/jni/uavs3d_avx2.mk b/build/android/ndk/jni/uavs3d_avx2.mk new file mode 100644 index 0000000..d80401a --- /dev/null +++ b/build/android/ndk/jni/uavs3d_avx2.mk @@ -0,0 +1,11 @@ + +AVX_SRC_PATH:=../../../../source/decore/avx2 + +uavs3d_srcs_avx += $(AVX_SRC_PATH)/alf_avx2.c +uavs3d_srcs_avx += $(AVX_SRC_PATH)/avx2.c +uavs3d_srcs_avx += $(AVX_SRC_PATH)/inter_pred_avx2.c +uavs3d_srcs_avx += $(AVX_SRC_PATH)/intra_pred_avx2.c +uavs3d_srcs_avx += $(AVX_SRC_PATH)/itrans_avx2.c +uavs3d_srcs_avx += $(AVX_SRC_PATH)/pixel_avx2.c +uavs3d_srcs_avx += $(AVX_SRC_PATH)/sao_avx2.c + diff --git a/build/android/ndk/jni/uavs3d_clear_vars.mk b/build/android/ndk/jni/uavs3d_clear_vars.mk new file mode 100644 index 0000000..0c47cfc --- /dev/null +++ b/build/android/ndk/jni/uavs3d_clear_vars.mk @@ -0,0 +1,6 @@ +include $(CLEAR_VARS) +uavs3d_srcs_c := +uavs3d_srcs_test := 
+uavs3d_srcs_arm := +uavs3d_srcs_sse := +uavs3d_srcs_avx := diff --git a/build/android/ndk/jni/uavs3d_main.mk b/build/android/ndk/jni/uavs3d_main.mk new file mode 100644 index 0000000..6506e89 --- /dev/null +++ b/build/android/ndk/jni/uavs3d_main.mk @@ -0,0 +1,61 @@ + +### for posix pthread +#LOCAL_SHARED_LIBRARIES := libcutil + +### include search path when compiling all sources (C,C++,Assembly) +LOCAL_C_INCLUDES +=$(INCLUDE_PATH) \ + $(LOCAL_PATH)/../app + +### c source code +uavs3d_srcs_c += $(SRC_PATH)/decore/alf.c +uavs3d_srcs_c += $(SRC_PATH)/decore/deblock.c +uavs3d_srcs_c += $(SRC_PATH)/decore/inter_pred.c +uavs3d_srcs_c += $(SRC_PATH)/decore/intra_pred.c +uavs3d_srcs_c += $(SRC_PATH)/decore/inv_trans.c +uavs3d_srcs_c += $(SRC_PATH)/decore/pic_manager.c +uavs3d_srcs_c += $(SRC_PATH)/decore/recon.c +uavs3d_srcs_c += $(SRC_PATH)/decore/sao.c +uavs3d_srcs_c += $(SRC_PATH)/decore/com_table.c +uavs3d_srcs_c += $(SRC_PATH)/decore/threadpool.c +uavs3d_srcs_c += $(SRC_PATH)/decore/win32thread.c +uavs3d_srcs_c += $(SRC_PATH)/decore/com_util.c +uavs3d_srcs_c += $(SRC_PATH)/decoder/uavs3d.c +uavs3d_srcs_c += $(SRC_PATH)/decoder/bitstream.c +uavs3d_srcs_c += $(SRC_PATH)/decoder/parser.c +uavs3d_srcs_c += $(SRC_PATH)/decoder/dec_util.c + + +LOCAL_CFLAGS += -O3 -fPIC -std=gnu99 -I../../../source/decore + +### To support 10bit streams decoding: edit it to -DCOMPILE_10BIT=1 +LOCAL_CFLAGS += -DCOMPILE_10BIT=0 + +ifeq ($(TARGET_ARCH),arm) + ifeq ($(TARGET_ARCH_ABI), armeabi-v7a) + # build armv7a + LOCAL_CFLAGS += -mfpu=neon -D_armv7a + include $(LOCAL_PATH)/uavs3d_armv7a.mk + endif +endif + +ifeq ($(TARGET_ARCH),arm64) + # build arm64 + LOCAL_CFLAGS += -D_arm64 + include $(LOCAL_PATH)/uavs3d_arm64.mk +endif + +ifeq ($(TARGET_ARCH),x86) + # build x86 + LOCAL_CFLAGS += -msse4.2 -mavx2 + include $(LOCAL_PATH)/uavs3d_sse2.mk + include $(LOCAL_PATH)/uavs3d_avx2.mk +endif + +ifeq ($(TARGET_ARCH),x86_64) + # build x86_64 + LOCAL_CFLAGS += -msse4.2 -mavx2 + include 
$(LOCAL_PATH)/uavs3d_sse2.mk + include $(LOCAL_PATH)/uavs3d_avx2.mk +endif + +LOCAL_SRC_FILES := $(uavs3d_srcs_c) $(uavs3d_srcs_arm) $(uavs3d_srcs_sse) $(uavs3d_srcs_avx) $(uavs3d_srcs_test) diff --git a/build/android/ndk/jni/uavs3d_sse2.mk b/build/android/ndk/jni/uavs3d_sse2.mk new file mode 100644 index 0000000..1f8847a --- /dev/null +++ b/build/android/ndk/jni/uavs3d_sse2.mk @@ -0,0 +1,11 @@ + +SSE_SRC_PATH:=../../../../source/decore/sse + +uavs3d_srcs_sse += $(SSE_SRC_PATH)/alf_sse.c +uavs3d_srcs_sse += $(SSE_SRC_PATH)/deblock_sse.c +uavs3d_srcs_sse += $(SSE_SRC_PATH)/inter_pred_sse.c +uavs3d_srcs_sse += $(SSE_SRC_PATH)/intra_pred_sse.c +uavs3d_srcs_sse += $(SSE_SRC_PATH)/itrans_sse.c +uavs3d_srcs_sse += $(SSE_SRC_PATH)/pixel_sse.c +uavs3d_srcs_sse += $(SSE_SRC_PATH)/sao_sse.c +uavs3d_srcs_sse += $(SSE_SRC_PATH)/sse.c diff --git a/build/x86_windows/common.vcxproj b/build/vs2017/common.vcxproj similarity index 61% rename from build/x86_windows/common.vcxproj rename to build/vs2017/common.vcxproj index c8cd533..95de5c4 100644 --- a/build/x86_windows/common.vcxproj +++ b/build/vs2017/common.vcxproj @@ -1,157 +1,249 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - {3F9C7116-C287-40D7-865C-D8C89CF4FF31} - Win32Proj - com_lib_vs17 - common - 10.0.17763.0 - - - - StaticLibrary - true - MultiByte - v141 - - - StaticLibrary - false - true - MultiByte - v141 - - - - - - - - - - - - - ..\..\lib\ - - - $(ProjectName) - $(Platform)\$(Configuration)\$(ProjectName)\ - - - ..\..\lib\ - $(ProjectName) - $(Platform)\$(Configuration)\$(ProjectName)\ - - - - NotUsing - Level3 - Disabled - WIN64;X86F;_DEBUG;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS - ..\..\source\decore - - - - - CompileAsC - Prompt - $(IntDir)vc$(PlatformToolsetVersion).pdb - MultiThreadedDebug - true - /arch:AVX %(AdditionalOptions) - - - Windows - true - - - ..\..\lib\$(ProjectName).lib - - - - - Level3 - NotUsing 
- MaxSpeed - true - true - WIN64;X86F;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions); - ..\..\source\decore - ProgramDatabase - - - - - CompileAsC - Prompt - $(IntDir)vc$(PlatformToolsetVersion).pdb - MultiThreaded - true - /arch:AVX %(AdditionalOptions) - - - Windows - true - true - true - - - ..\..\lib\$(ProjectName).lib - - - - - + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {3F9C7116-C287-40D7-865C-D8C89CF4FF31} + Win32Proj + com_lib_vs17 + common + 10.0.16299.0 + + + + StaticLibrary + true + MultiByte + v141 + + + StaticLibrary + true + MultiByte + v141 + + + StaticLibrary + false + true + MultiByte + v141 + + + StaticLibrary + false + true + MultiByte + v141 + + + + + + + + + + + + + + + + + + + ..\..\lib\ + + + $(ProjectName) + $(Platform)\$(Configuration)\$(ProjectName)\ + + + $(ProjectName) + $(Platform)\$(Configuration)\$(ProjectName)\ + + + ..\..\lib\ + $(ProjectName) + $(Platform)\$(Configuration)\$(ProjectName)\ + + + $(ProjectName) + $(Platform)\$(Configuration)\$(ProjectName)\ + + + + NotUsing + Level3 + Disabled + WIN64;X86F;_DEBUG;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS + ..\..\source\decore + + + + + CompileAsC + Prompt + $(IntDir)vc$(PlatformToolsetVersion).pdb + MultiThreadedDebug + true + /arch:AVX %(AdditionalOptions) + + + Windows + true + + + ..\..\lib\$(ProjectName).lib + + + + + NotUsing + Level3 + Disabled + WIN64;X86F;_DEBUG;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS + ..\..\source\decore + + + + + CompileAsC + Prompt + $(IntDir)vc$(PlatformToolsetVersion).pdb + MultiThreadedDebug + true + /arch:AVX %(AdditionalOptions) + + + Windows + true + + + ..\..\lib\$(ProjectName).lib + + + + + Level3 + NotUsing + MaxSpeed + true + true + WIN64;X86F;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions); + ..\..\source\decore + ProgramDatabase + + + + + CompileAsC + Prompt + 
$(IntDir)vc$(PlatformToolsetVersion).pdb + MultiThreaded + true + /arch:AVX %(AdditionalOptions) + + + Windows + true + true + true + + + ..\..\lib\$(ProjectName).lib + + + + + Level3 + NotUsing + MaxSpeed + true + true + WIN64;X86F;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions); + ..\..\source\decore + ProgramDatabase + + + + + CompileAsC + Prompt + $(IntDir)vc$(PlatformToolsetVersion).pdb + MultiThreaded + true + /arch:AVX %(AdditionalOptions) + + + Windows + true + true + true + + + ..\..\lib\$(ProjectName).lib + + + + + \ No newline at end of file diff --git a/build/x86_windows/common.vcxproj.filters b/build/vs2017/common.vcxproj.filters similarity index 100% rename from build/x86_windows/common.vcxproj.filters rename to build/vs2017/common.vcxproj.filters diff --git a/build/x86_windows/libuavs3d.vcxproj b/build/vs2017/libuavs3d.vcxproj similarity index 55% rename from build/x86_windows/libuavs3d.vcxproj rename to build/vs2017/libuavs3d.vcxproj index d92a51f..bb42a6c 100644 --- a/build/x86_windows/libuavs3d.vcxproj +++ b/build/vs2017/libuavs3d.vcxproj @@ -1,115 +1,193 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - - - - - - - - - - - - {40B445E8-306A-4C77-9B19-FC76C2379F79} - dec_lib - 10.0.17763.0 - libuavs3d - - - - DynamicLibrary - true - v141 - MultiByte - - - DynamicLibrary - false - v141 - true - MultiByte - - - - - - - - - - - - - - - $(Platform)\$(Configuration)\$(ProjectName)\ - ..\..\bin - ..\..\lib;$(LibraryPath) - - - $(Platform)\$(Configuration)\$(ProjectName)\ - ..\..\bin - ..\..\lib;$(LibraryPath) - - - - Level3 - Disabled - true - ..\..\source\decore;..\..\source\decoder - MultiThreadedDebug - $(IntDir)vc$(PlatformToolsetVersion).pdb - WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS;_DEBUG - true - /arch:AVX %(AdditionalOptions) - - - true - common.lib;%(AdditionalDependencies) - NotSet - 1.0 - - - - - Level3 - MaxSpeed - true - true - true - ..\..\source\decore;..\..\source\decoder - MultiThreaded - CompileAsC - 
$(IntDir)vc$(PlatformToolsetVersion).pdb - WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS; - true - /arch:AVX %(AdditionalOptions) - - - true - true - true - common.lib;%(AdditionalDependencies) - NotSet - 1.0 - - - - - + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + + + + + + + + + + {40B445E8-306A-4C77-9B19-FC76C2379F79} + dec_lib + 10.0.16299.0 + libuavs3d + + + + DynamicLibrary + true + v141 + MultiByte + + + DynamicLibrary + true + v141 + MultiByte + + + DynamicLibrary + false + v141 + true + MultiByte + + + DynamicLibrary + false + v141 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + $(Platform)\$(Configuration)\$(ProjectName)\ + ..\..\bin + ..\..\lib;$(LibraryPath) + + + ..\..\lib;$(LibraryPath) + $(Platform)\$(Configuration)\$(ProjectName)\ + + + $(Platform)\$(Configuration)\$(ProjectName)\ + ..\..\bin + ..\..\lib;$(LibraryPath) + + + ..\..\lib;$(LibraryPath) + $(Platform)\$(Configuration)\$(ProjectName)\ + + + + Level3 + Disabled + true + ..\..\source\decore;..\..\source\decoder + MultiThreadedDebug + $(IntDir)vc$(PlatformToolsetVersion).pdb + WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS;_DEBUG + true + /arch:AVX %(AdditionalOptions) + + + true + common.lib;%(AdditionalDependencies) + NotSet + 1.0 + + + + + Level3 + Disabled + true + ..\..\source\decore;..\..\source\decoder + MultiThreadedDebug + $(IntDir)vc$(PlatformToolsetVersion).pdb + WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS;_DEBUG + true + /arch:AVX %(AdditionalOptions) + + + true + common.lib;%(AdditionalDependencies) + NotSet + 1.0 + + + + + Level3 + MaxSpeed + true + true + true + ..\..\source\decore;..\..\source\decoder + MultiThreaded + CompileAsC + $(IntDir)vc$(PlatformToolsetVersion).pdb + WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS; + true + /arch:AVX %(AdditionalOptions) + + + true + true + true + common.lib;%(AdditionalDependencies) + NotSet + 1.0 + + + + + Level3 + MaxSpeed + true + true + true + 
..\..\source\decore;..\..\source\decoder + MultiThreaded + CompileAsC + $(IntDir)vc$(PlatformToolsetVersion).pdb + WIN64;;%(PreprocessorDefinitions);UAVS3D_EXPORTS; + true + /arch:AVX %(AdditionalOptions) + + + true + true + true + common.lib;%(AdditionalDependencies) + NotSet + 1.0 + + + + + \ No newline at end of file diff --git a/build/x86_windows/uavs3d.sln b/build/vs2017/uavs3d.sln similarity index 68% rename from build/x86_windows/uavs3d.sln rename to build/vs2017/uavs3d.sln index e6d34c4..d098d33 100644 --- a/build/x86_windows/uavs3d.sln +++ b/build/vs2017/uavs3d.sln @@ -1,47 +1,61 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.27130.2026 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "common", "common.vcxproj", "{3F9C7116-C287-40D7-865C-D8C89CF4FF31}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "uavs3d", "uavs3d.vcxproj", "{798F7D68-C94D-41AF-86A4-98F7726D172C}" - ProjectSection(ProjectDependencies) = postProject - {3F9C7116-C287-40D7-865C-D8C89CF4FF31} = {3F9C7116-C287-40D7-865C-D8C89CF4FF31} - {40B445E8-306A-4C77-9B19-FC76C2379F79} = {40B445E8-306A-4C77-9B19-FC76C2379F79} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libuavs3d", "libuavs3d.vcxproj", "{40B445E8-306A-4C77-9B19-FC76C2379F79}" - ProjectSection(ProjectDependencies) = postProject - {3F9C7116-C287-40D7-865C-D8C89CF4FF31} = {3F9C7116-C287-40D7-865C-D8C89CF4FF31} - EndProjectSection -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x64.ActiveCfg = Debug|x64 - {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x64.Build.0 = Debug|x64 - {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x64.ActiveCfg = Release|x64 - 
{3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x64.Build.0 = Release|x64 - {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x64.ActiveCfg = Debug|x64 - {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x64.Build.0 = Debug|x64 - {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x64.ActiveCfg = Release|x64 - {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x64.Build.0 = Release|x64 - {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x64.ActiveCfg = Debug|x64 - {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x64.Build.0 = Debug|x64 - {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x64.ActiveCfg = Release|x64 - {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {ED69324B-A55F-49DC-91D3-5F1D34DF875C} - EndGlobalSection - GlobalSection(Performance) = preSolution - HasPerformanceSessions = true - EndGlobalSection -EndGlobal + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.26228.4 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "common", "common.vcxproj", "{3F9C7116-C287-40D7-865C-D8C89CF4FF31}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "uavs3d", "uavs3d.vcxproj", "{798F7D68-C94D-41AF-86A4-98F7726D172C}" + ProjectSection(ProjectDependencies) = postProject + {3F9C7116-C287-40D7-865C-D8C89CF4FF31} = {3F9C7116-C287-40D7-865C-D8C89CF4FF31} + {40B445E8-306A-4C77-9B19-FC76C2379F79} = {40B445E8-306A-4C77-9B19-FC76C2379F79} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libuavs3d", "libuavs3d.vcxproj", "{40B445E8-306A-4C77-9B19-FC76C2379F79}" + ProjectSection(ProjectDependencies) = postProject + {3F9C7116-C287-40D7-865C-D8C89CF4FF31} = {3F9C7116-C287-40D7-865C-D8C89CF4FF31} + EndProjectSection +EndProject +Global + 
GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x64.ActiveCfg = Debug|x64 + {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x64.Build.0 = Debug|x64 + {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x86.ActiveCfg = Debug|Win32 + {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Debug|x86.Build.0 = Debug|Win32 + {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x64.ActiveCfg = Release|x64 + {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x64.Build.0 = Release|x64 + {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x86.ActiveCfg = Release|Win32 + {3F9C7116-C287-40D7-865C-D8C89CF4FF31}.Release|x86.Build.0 = Release|Win32 + {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x64.ActiveCfg = Debug|x64 + {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x64.Build.0 = Debug|x64 + {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x86.ActiveCfg = Debug|Win32 + {798F7D68-C94D-41AF-86A4-98F7726D172C}.Debug|x86.Build.0 = Debug|Win32 + {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x64.ActiveCfg = Release|x64 + {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x64.Build.0 = Release|x64 + {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x86.ActiveCfg = Release|Win32 + {798F7D68-C94D-41AF-86A4-98F7726D172C}.Release|x86.Build.0 = Release|Win32 + {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x64.ActiveCfg = Debug|x64 + {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x64.Build.0 = Debug|x64 + {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x86.ActiveCfg = Debug|Win32 + {40B445E8-306A-4C77-9B19-FC76C2379F79}.Debug|x86.Build.0 = Debug|Win32 + {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x64.ActiveCfg = Release|x64 + {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x64.Build.0 = Release|x64 + {40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x86.ActiveCfg = Release|Win32 + 
{40B445E8-306A-4C77-9B19-FC76C2379F79}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {ED69324B-A55F-49DC-91D3-5F1D34DF875C} + EndGlobalSection + GlobalSection(Performance) = preSolution + HasPerformanceSessions = true + EndGlobalSection +EndGlobal diff --git a/build/x86_windows/uavs3d.vcxproj b/build/vs2017/uavs3d.vcxproj similarity index 55% rename from build/x86_windows/uavs3d.vcxproj rename to build/vs2017/uavs3d.vcxproj index 9557243..25603ec 100644 --- a/build/x86_windows/uavs3d.vcxproj +++ b/build/vs2017/uavs3d.vcxproj @@ -1,118 +1,200 @@ - - - - - Debug - x64 - - - Release - x64 - - - - {798F7D68-C94D-41AF-86A4-98F7726D172C} - Win32Proj - dec_test_vs17 - uavs3d - 10.0.17763.0 - - - - Application - true - MultiByte - v141 - - - Application - false - true - MultiByte - v141 - - - - - - - - - - - - - false - $(Platform)\$(Configuration)\$(ProjectName)\ - $(SolutionDir)\..\..\bin - - - false - $(Platform)\$(Configuration)\$(ProjectName)\ - $(SolutionDir)\..\..\bin - - - - - - Level3 - Disabled - WIN64;_CONSOLE;%(PreprocessorDefinitions);_DEBUG - ..\..\inc;..\..\src - $(IntDir)vc$(PlatformToolsetVersion).pdb - MultiThreadedDebug - true - /arch:AVX %(AdditionalOptions) - - - Console - true - - - ..\..\lib - - - - - Level3 - - - MaxSpeed - true - true - WIN64;_CONSOLE;%(PreprocessorDefinitions) - ..\..\inc;..\..\src - $(IntDir)vc$(PlatformToolsetVersion).pdb - ProgramDatabase - MultiThreaded - true - /arch:AVX %(AdditionalOptions) - - - Console - true - true - true - - - ..\..\lib - - - - - - - - - - - {3f9c7116-c287-40d7-865c-d8c89cf4ff31} - - - {40b445e8-306a-4c77-9b19-fc76c2379f79} - - - - - + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {798F7D68-C94D-41AF-86A4-98F7726D172C} + Win32Proj + dec_test_vs17 + uavs3d + 10.0.16299.0 + + + + Application 
+ true + MultiByte + v141 + + + Application + true + MultiByte + v141 + + + Application + false + true + MultiByte + v141 + + + Application + false + true + MultiByte + v141 + + + + + + + + + + + + + + + + + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)\..\..\bin + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)\..\..\bin + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + + + + Level3 + Disabled + WIN64;_CONSOLE;%(PreprocessorDefinitions);_DEBUG + ..\..\inc;..\..\src + $(IntDir)vc$(PlatformToolsetVersion).pdb + MultiThreadedDebug + true + /arch:AVX %(AdditionalOptions) + + + Console + true + + + ..\..\lib + + + + + + + Level3 + Disabled + WIN64;_CONSOLE;%(PreprocessorDefinitions);_DEBUG + ..\..\inc;..\..\src + $(IntDir)vc$(PlatformToolsetVersion).pdb + MultiThreadedDebug + true + /arch:AVX %(AdditionalOptions) + + + Console + true + + + ..\..\lib + + + + + Level3 + + + MaxSpeed + true + true + WIN64;_CONSOLE;%(PreprocessorDefinitions) + ..\..\inc;..\..\src + $(IntDir)vc$(PlatformToolsetVersion).pdb + ProgramDatabase + MultiThreaded + true + /arch:AVX %(AdditionalOptions) + + + Console + true + true + true + + + ..\..\lib + + + + + Level3 + + + MaxSpeed + true + true + WIN64;_CONSOLE;%(PreprocessorDefinitions) + ..\..\inc;..\..\src + $(IntDir)vc$(PlatformToolsetVersion).pdb + ProgramDatabase + MultiThreaded + true + /arch:AVX %(AdditionalOptions) + + + Console + true + true + true + + + ..\..\lib + + + + + + + + + + + {3f9c7116-c287-40d7-865c-d8c89cf4ff31} + + + {40b445e8-306a-4c77-9b19-fc76c2379f79} + + + + + \ No newline at end of file diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 4530a63..b52363e 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -1,19 +1,93 @@ set(LIBNAME uavs3d) +# check cpu +if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64" OR + "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64") + 
if(${CMAKE_SIZEOF_VOID_P} EQUAL 4) + set(UAVS3D_TARGET_CPU "x86") + elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8) + set(UAVS3D_TARGET_CPU "x86_64") + else() + message(FATAL_ERROR + " Unexpected pointer size ${CMAKE_SIZEOF_VOID_P} for ${CMAKE_SYSTEM_PROCESSOR}\n") + endif() +elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i386" OR + "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86") + set(UAVS3D_TARGET_CPU "x86") +elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64" OR + "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64") + set(UAVS3D_TARGET_CPU "arm64") +elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^arm") + set(UAVS3D_TARGET_CPU "armv7") +elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "loongarch64") + set(UAVS3D_TARGET_CPU "loongarch64") +else() + message(WARNING "unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}\n") + set(UAVS3D_TARGET_CPU "generic") +endif() + # add source aux_source_directory(./decoder DIR_UAVS3D_SRC) aux_source_directory(./decore DIR_UAVS3D_CORE) -aux_source_directory(./decore/sse DIR_X86_SRC) -aux_source_directory(./decore/avx2 DIR_X86_256_SRC) list(APPEND DIR_UAVS3D_SRC ${DIR_UAVS3D_CORE}) include_directories("decore") +set(UAVS3D_ASM_FILES "") + +if("${UAVS3D_TARGET_CPU}" MATCHES "x86" OR + "${UAVS3D_TARGET_CPU}" MATCHES "x86_64") + aux_source_directory(./decore/sse DIR_X86_SRC) + aux_source_directory(./decore/avx2 DIR_X86_256_SRC) + set_source_files_properties(${DIR_X86_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -msse4.2") + set_source_files_properties(${DIR_X86_256_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -mavx2") + + list(APPEND UAVS3D_ASM_FILES ${DIR_X86_SRC}) + list(APPEND UAVS3D_ASM_FILES ${DIR_X86_256_SRC}) +elseif("${UAVS3D_TARGET_CPU}" MATCHES "armv7") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/armv7.c") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/alf_armv7.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/deblock_armv7.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/def_armv7.S") + list(APPEND UAVS3D_ASM_FILES 
"./decore/arm64/inter_pred_armv7.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/intra_pred_armv7.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/dct2_armv7.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/itrans_dct8_dst7_armv7.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/pixel_armv7.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/sao_armv7.c") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/sao_kernel_armv7.S") + + add_definitions(-D _armv7a) + enable_language(ASM) +elseif("${UAVS3D_TARGET_CPU}" MATCHES "arm64") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/arm64.c") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/alf_arm64.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/deblock_arm64.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/def_arm64.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/inter_pred_arm64.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/intra_pred_arm64.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/intra_pred_chroma_arm64.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/itrans_arm64.c") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/itrans_dct2_arm64.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/itrans_dct8_dst7_arm64.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/pixel_arm64.S") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/sao_arm64.c") + list(APPEND UAVS3D_ASM_FILES "./decore/arm64/sao_kernel_arm64.S") -set_source_files_properties(${DIR_UAVS3D_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -std=c99 -O3") -set_source_files_properties(${DIR_X86_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -std=c99 -O3 -msse4.2") -set_source_files_properties(${DIR_X86_256_SRC} PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -std=c99 -O3 -mavx2") + add_definitions(-D _arm64) + enable_language(ASM) +elseif("${UAVS3D_TARGET_CPU}" MATCHES "loongarch64") + # loongarch64 +endif() + +if(COMPILE_10BIT) + add_definitions(-DCOMPILE_10BIT=1) + message("-- compile 10bit") +else() + 
add_definitions(-DCOMPILE_10BIT=0) + message("-- compile 8bit") +endif() # get version set (CONFIG_DIR ${CMAKE_CURRENT_SOURCE_DIR}/..) @@ -30,6 +104,7 @@ endfunction() extract_version_string("${CONFIG_DIR}/version.h" uavs3d_version) MESSAGE(STATUS "uavs3d version \t\t: ${uavs3d_version}") +MESSAGE(STATUS "Target CPU\t\t\t: ${UAVS3D_TARGET_CPU}") # pkg-config find_package(Threads REQUIRED) set(prefix "${CMAKE_INSTALL_PREFIX}") @@ -65,9 +140,11 @@ MESSAGE(STATUS "BUILD_SHARED_LIBS \t\t: true") else() MESSAGE(STATUS "BUILD_SHARED_LIBS \t\t: false") endif() -add_library(${LIBNAME} ${DIR_UAVS3D_SRC} ${DIR_X86_256_SRC} ${DIR_X86_SRC}) +add_library(${LIBNAME} ${DIR_UAVS3D_SRC} ${UAVS3D_ASM_FILES}) -target_link_libraries(${LIBNAME} m) +if (NOT MSVC) + target_link_libraries(${LIBNAME} m) +endif() if(CMAKE_USE_PTHREADS_INIT) target_link_libraries(${LIBNAME} pthread) endif() @@ -76,4 +153,3 @@ endif() install(TARGETS uavs3d LIBRARY DESTINATION ${CMAKE_INSTALL_LIB_DIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIB_DIR}) install(FILES decoder/uavs3d.h DESTINATION ${CMAKE_INSTALL_INCLUDE_DIR}) install(FILES ${CONFIG_DIR}/${LIBNAME}.pc DESTINATION ${CMAKE_INSTALL_PKGCONFIG_DIR}) - diff --git a/source/decoder/bitstream.c b/source/decoder/bitstream.c index 9c433ad..1c3aaac 100644 --- a/source/decoder/bitstream.c +++ b/source/decoder/bitstream.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decoder/bitstream.h b/source/decoder/bitstream.h index 60052b4..f1b1043 100644 --- a/source/decoder/bitstream.h +++ b/source/decoder/bitstream.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decoder/dec_type.h b/source/decoder/dec_type.h index 1761605..cfb0442 100644 --- a/source/decoder/dec_type.h +++ b/source/decoder/dec_type.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decoder/dec_util.c b/source/decoder/dec_util.c index e9374d9..e30e323 100644 --- a/source/decoder/dec_util.c +++ b/source/decoder/dec_util.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -250,7 +245,7 @@ static void uavs3d_always_inline com_mv_rounding_affine(s32 hor, s32 ver, s32 * } } -static void uavs3d_always_inline check_umve_motion_availability(int scup, int cu_width, int cu_height, int i_scu, int neighbor[NUM_SPATIAL_MV], int valid[NUM_SPATIAL_MV], com_scu_t * map_scu, s16(*map_mv)[REFP_NUM][MV_D], s8(*map_refi)[REFP_NUM]) +static void uavs3d_always_inline check_umve_motion_availability(int scup, int cu_width, int cu_height, int i_scu, int neighbor[5], int valid[5], com_scu_t * map_scu, s16(*map_mv)[REFP_NUM][MV_D], s8(*map_refi)[REFP_NUM]) { int cu_width_in_scu = cu_width >> MIN_CU_LOG2; int cu_height_in_scu = cu_height >> MIN_CU_LOG2; @@ -985,7 +980,6 @@ static int get_affine_merge_candidate(com_core_t *core, s8 mrg_list_refi[REFP_NU int scup = core->cu_scup; com_map_t *map = &core->map; com_seqh_t *seqhdr = core->seqhdr; - com_ref_pic_t(*refp)[REFP_NUM] = core->refp; int i_scu = seqhdr->i_scu; int lidx, i, k; int cu_width = core->cu_width; @@ -1112,6 +1106,7 @@ static int get_affine_merge_candidate(com_core_t *core, s8 mrg_list_refi[REFP_NU int neb_addr_rb = scup + i_scu * (cu_height_in_scu - 1) + (cu_width_in_scu - 1); int scu_y; int scup_co = get_colocal_scup(neb_addr_rb, i_scu, seqhdr->pic_width_in_scu, seqhdr->pic_height_in_scu, &scu_y); + com_ref_pic_t(*refp)[REFP_NUM] = core->refp; if (core->pichdr->slice_type == SLICE_B) { uavs3d_check_ref_avaliable(refp[0][REFP_1].pic, scu_y << MIN_CU_LOG2); diff --git a/source/decoder/dec_util.h b/source/decoder/dec_util.h index 77c00a6..9d46217 100644 --- a/source/decoder/dec_util.h +++ b/source/decoder/dec_util.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decoder/parser.c b/source/decoder/parser.c index 45bb26a..9405daa 100644 --- a/source/decoder/parser.c +++ b/source/decoder/parser.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -88,6 +83,9 @@ int dec_parse_sqh(com_bs_t * bs, com_seqh_t * seqhdr) dec_bs_read1(bs, 1); //marker_bit seqhdr->vertical_size = dec_bs_read(bs, 14, 0, COM_UINT32_MAX); + seqhdr->display_horizontal_size = seqhdr->horizontal_size; + seqhdr->display_vertical_size = seqhdr->vertical_size; + seqhdr->chroma_format = (u8)dec_bs_read(bs, 2, 1, 1); seqhdr->sample_precision = (u8)dec_bs_read(bs, 3, 1, 2); @@ -262,19 +260,21 @@ static int user_data(com_pic_header_t *pichdr, com_bs_t * bs) return RET_OK; } -static int sequence_display_extension(com_bs_t * bs) +static int sequence_display_extension(com_bs_t * bs, com_seqh_t *seqhdr) { dec_bs_read(bs, 3, 0, COM_UINT32_MAX); // video_format u(3) dec_bs_read1(bs, -1); // sample_range u(1) - int colour_description = dec_bs_read1(bs, -1); // colour_description u(1) - if (colour_description) { - dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // colour_primaries u(8) - dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // transfer_characteristics u(8) - dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // matrix_coefficients u(8) + + seqhdr->colour_description = dec_bs_read1(bs, -1); // colour_description u(1) + + if (seqhdr->colour_description) { + seqhdr->colour_primaries = dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // colour_primaries u(8) + seqhdr->transfer_characteristics = dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // transfer_characteristics u(8) + seqhdr->matrix_coefficients = dec_bs_read(bs, 8, 0, COM_UINT32_MAX); // matrix_coefficients u(8) } - dec_bs_read(bs, 14, 0, COM_UINT32_MAX); // display_horizontal_size u(14) + seqhdr->display_horizontal_size = dec_bs_read(bs, 14, 
0, COM_UINT32_MAX); // display_horizontal_size u(14) dec_bs_read1(bs, 1); //marker_bit - dec_bs_read(bs, 14, 0, COM_UINT32_MAX); // display_vertical_size u(14) + seqhdr->display_vertical_size = dec_bs_read(bs, 14, 0, COM_UINT32_MAX); // display_vertical_size u(14) char td_mode_flag = dec_bs_read1(bs, -1); // td_mode_flag u(1) if (td_mode_flag == 1) { @@ -460,7 +460,7 @@ static int extension_data(com_bs_t * bs, com_seqh_t *seqhdr, com_pic_header_t *p if (i == 0) { int ret = dec_bs_read(bs, 4, 0, COM_UINT32_MAX); if (ret == 2) { - sequence_display_extension(bs); + sequence_display_extension(bs, seqhdr); } else if (ret == 3) { temporal_scalability_extension(bs); } else if (ret == 4) { @@ -1647,7 +1647,7 @@ static uavs3d_always_inline u32 lbac_read_unary_sym_ep(com_lbac_t * lbac) low = lbac_refill2(lbac, low); } val += bin; - } while (bin); + } while (bin && lbac->cur < lbac->end); lbac->range = range; lbac->low = low; @@ -2834,8 +2834,7 @@ int dec_parse_lcu_delta_qp(com_lbac_t * lbac, int last_dqp) bin = lbac_dec_bin(lbac, ctx->lcu_qp_delta + act_ctx); act_ctx = min(3, act_ctx + 1); act_sym += !bin; - } - while (!bin); + } while (!bin && lbac->cur < lbac->end); } dquant = (act_sym + 1) >> 1; diff --git a/source/decoder/parser.h b/source/decoder/parser.h index 20cd103..93fbb1a 100644 --- a/source/decoder/parser.h +++ b/source/decoder/parser.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decoder/uavs3d.c b/source/decoder/uavs3d.c index 9ac7dc2..7b0491d 100644 --- a/source/decoder/uavs3d.c +++ b/source/decoder/uavs3d.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -1060,7 +1055,7 @@ int __cdecl uavs3d_flush(void *h, uavs3d_io_frm_t* frm_out) void* __cdecl uavs3d_create(uavs3d_cfg_t * dec_cfg, uavs3d_lib_output_callback_t callback, int * err) { - uavs3d_dec_t *ctx = NULL; + uavs3d_dec_t *ctx; printf("libuavs3d(%2d): %s_%s, %s\n", BIT_DEPTH, VERSION_STR, VERSION_TYPE, VERSION_SHA1); diff --git a/source/decoder/uavs3d.h b/source/decoder/uavs3d.h index f5ea2b0..1931917 100644 --- a/source/decoder/uavs3d.h +++ b/source/decoder/uavs3d.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -137,6 +132,9 @@ typedef struct uavs3d_com_seqh_t { unsigned int bbv_buffer_size; /* 18 bits */ int horizontal_size; /* 14 bits */ int vertical_size; /* 14 bits */ + int display_horizontal_size; /* 14 bits */ + int display_vertical_size; /* 14 bits */ + unsigned char log2_max_cu_width_height; /* 3 bits */ unsigned char min_cu_size; unsigned char max_part_ratio_log2; @@ -211,6 +209,13 @@ typedef struct uavs3d_com_seqh_t { /* alf map */ unsigned char *alf_idx_map; + + /* hdr info */ + unsigned char colour_description; + unsigned char colour_primaries; + unsigned char transfer_characteristics; + unsigned char matrix_coefficients; + } com_seqh_t; #define FRAME_MAX_PLANES 3 diff --git a/source/decore/alf.c b/source/decore/alf.c index 89a2411..111d99f 100644 --- a/source/decore/alf.c +++ b/source/decore/alf.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/arm64/alf_arm64.S b/source/decore/arm64/alf_arm64.S index 49a620f..9998073 100644 --- a/source/decore/arm64/alf_arm64.S +++ b/source/decore/arm64/alf_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/arm64/arm64.c b/source/decore/arm64/arm64.c index 0850349..74c2f81 100644 --- a/source/decore/arm64/arm64.c +++ b/source/decore/arm64/arm64.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -102,7 +97,7 @@ static void uavs3d_padding_rows_chroma_arm64(pel *src, int i_src, int width, int void uavs3d_funs_init_arm64() { -#if (BIT_DEPTH == 8) +#if !COMPILE_10BIT uavs3d_funs_handle.sao[ Y_C] = uavs3d_sao_on_lcu_arm64; uavs3d_funs_handle.sao[UV_C] = uavs3d_sao_on_lcu_chroma_arm64; uavs3d_funs_handle.alf[ Y_C] = uavs3d_alf_one_lcu_arm64; @@ -311,7 +306,7 @@ void uavs3d_funs_init_arm64() uavs3d_funs_handle.itrans_dst7[1] = uavs3d_itrans_dct8_pb8_arm64; uavs3d_funs_handle.itrans_dst7[2] = uavs3d_itrans_dct8_pb16_arm64; - uavs3d_funs_handle.conv_fmt_16bit = uavs3d_conv_fmt_16bit_arm64; + //uavs3d_funs_handle.conv_fmt_16bit = uavs3d_conv_fmt_16bit_arm64; uavs3d_funs_handle.padding_rows_luma = uavs3d_padding_rows_luma_arm64; uavs3d_funs_handle.padding_rows_chroma = uavs3d_padding_rows_chroma_arm64; diff --git a/source/decore/arm64/arm64.h b/source/decore/arm64/arm64.h index 6ba0566..b2410ef 100644 --- a/source/decore/arm64/arm64.h +++ b/source/decore/arm64/arm64.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/arm64/deblock_arm64.S b/source/decore/arm64/deblock_arm64.S index ff47274..b152147 100644 --- a/source/decore/arm64/deblock_arm64.S +++ b/source/decore/arm64/deblock_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -1540,12 +1535,12 @@ deblock_ver_filtered: bif v17.16b, v1.16b, v25.16b bif v18.16b, v6.16b, v25.16b - mov v1.2d, v17.2d - mov v2.2d, v28.2d - mov v3.2d, v30.2d - mov v4.2d, v31.2d - mov v5.2d, v29.2d - mov v6.2d, v18.2d + mov v1.16b, v17.16b + mov v2.16b, v28.16b + mov v3.16b, v30.16b + mov v4.16b, v31.16b + mov v5.16b, v29.16b + mov v6.16b, v18.16b st4 {v0.H, v1.H, v2.H, v3.H}[0], [x2], #8 st4 {v4.H, v5.H, v6.H, v7.H}[0], [x2], x5 diff --git a/source/decore/arm64/def_arm64.S b/source/decore/arm64/def_arm64.S index c6a219d..82e267d 100644 --- a/source/decore/arm64/def_arm64.S +++ b/source/decore/arm64/def_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -35,25 +30,23 @@ * For more information, contact us at rgwang@pkusz.edu.cn. 
**************************************************************************************/ -#if defined(__ANDROID__) -.macro function name - .text - .align 4 - .global \name - .type \name, %function - \name: -.endm - #if defined(__aarch64__) && !defined(__arm64__) #define __arm64__ 1 #endif -#elif defined(__APPLE__) + +#if defined(__APPLE__) .macro function name .text .align 4 .global _\name _\name: .endm +#else +.macro function name + .text + .align 4 + .global \name + .type \name, %function + \name: +.endm #endif - -#define COMPILE_10BIT 0 diff --git a/source/decore/arm64/inter_pred_arm64.S b/source/decore/arm64/inter_pred_arm64.S index e86addd..375f041 100644 --- a/source/decore/arm64/inter_pred_arm64.S +++ b/source/decore/arm64/inter_pred_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -1809,9 +1804,9 @@ if_hor_ver_chroma_w8_loop_y: smlal v30.4s, v22.4h, v7.h[3] smlal2 v31.4s, v22.8h, v7.h[3] - mov v16.2d, v20.2d - mov v17.2d, v21.2d - mov v18.2d, v22.2d + mov v16.16b, v20.16b + mov v17.16b, v21.16b + mov v18.16b, v22.16b rshrn v24.4h, v24.4s, #12 rshrn2 v24.8h, v25.4s, #12 @@ -2553,10 +2548,10 @@ if_hor_ver_luma_w4_loop_y: smlal v29.4s, v21.4h, v8.h[7] smlal2 v30.4s, v21.8h, v8.h[7] - mov v16.2d, v18.2d - mov v17.2d, v19.2d - mov v18.2d, v20.2d - mov v19.2d, v21.2d + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b rshrn v27.4h, v27.4s, #12 rshrn v28.4h, v28.4s, #12 @@ -5268,7 +5263,7 @@ function uavs3d_if_hor_ver_luma_w8_arm64 ldr w8, [sp] // w8 = max_val sub sp, sp, #80 - sub x9, sp, #16 + add x9, sp, #64 st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] st1 {v11.2d}, [x9] diff --git a/source/decore/arm64/intra_pred_arm64.S b/source/decore/arm64/intra_pred_arm64.S index e2c4629..5689bd4 100644 --- a/source/decore/arm64/intra_pred_arm64.S +++ b/source/decore/arm64/intra_pred_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -2996,12 +2991,12 @@ intra_pred_bi_ipf_end: // const s8 *flt_coef_hor, const s8 *flt_coef_ver, int w, int h, int bit_depth) //src->x0, dst->x1, i_dst->x2, flt_range_hor->x3, flt_range_ver->x4, flt_coef_hor->x5, flt_coef_ver->x6, w->x7 function uavs3d_intra_pred_ipf_arm64 -#if defined(__ANDROID__) - ldr w8, [sp] // w8 = h - ldr w9, [sp, #8] // w9 = bit_depth -#elif defined(__APPLE__) +#if defined(__APPLE__) ldr w8, [sp] ldr w9, [sp, #4] +#else + ldr w8, [sp] // w8 = h + ldr w9, [sp, #8] // w9 = bit_depth #endif add x0, x0, #1 // p_top = src + 1 diff --git a/source/decore/arm64/intra_pred_chroma_arm64.S b/source/decore/arm64/intra_pred_chroma_arm64.S index 2c142f3..b12d02a 100644 --- a/source/decore/arm64/intra_pred_chroma_arm64.S +++ b/source/decore/arm64/intra_pred_chroma_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -555,7 +550,7 @@ intra_pred_dc_uv_fillblock_w8: b intra_pred_dc_uv_end intra_pred_dc_uv_fillblock_w16: - mov v1.8h, v0.8h + mov v1.16b, v0.16b intra_pred_dc_uv_fillblock_w16_y: st1 {v0.8h, v1.8h}, [x1], x2 // store dst[x] st1 {v0.8h, v1.8h}, [x1], x2 @@ -570,9 +565,9 @@ intra_pred_dc_uv_fillblock_w32x: cmp w3, #64 beq intra_pred_dc_uv_fillblock_w64 - mov v1.8h, v0.8h - mov v2.8h, v0.8h - mov v3.8h, v0.8h + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b intra_pred_dc_uv_fillblock_w32_y: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // store dst[x] st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 @@ -585,9 +580,9 @@ intra_pred_dc_uv_fillblock_w32_y: intra_pred_dc_uv_fillblock_w64: sub x2, x2, #64 - mov v1.8h, v0.8h - mov v2.8h, v0.8h - mov v3.8h, v0.8h + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b intra_pred_dc_uv_fillblock_w64_y: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 diff --git a/source/decore/arm64/itrans_arm64.c b/source/decore/arm64/itrans_arm64.c index d26ec5f..9b2362e 100644 --- 
a/source/decore/arm64/itrans_arm64.c +++ b/source/decore/arm64/itrans_arm64.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/arm64/itrans_arm64.h b/source/decore/arm64/itrans_arm64.h index c4977d0..15d9a4b 100644 --- a/source/decore/arm64/itrans_arm64.h +++ b/source/decore/arm64/itrans_arm64.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/arm64/itrans_dct2_arm64.S b/source/decore/arm64/itrans_dct2_arm64.S index 00b99e1..0f37c7e 100644 --- a/source/decore/arm64/itrans_dct2_arm64.S +++ b/source/decore/arm64/itrans_dct2_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -758,9 +753,9 @@ dct2_h16_1st_loopx: dct2_h16_2nd_transform: sub sp, sp, #48 - sub x7, sp, #16 - st1 {v10.8h, v11.8h}, [sp] - st1 {v12.8h}, [x7] + add x7, sp, #16 + st1 {v10.8h, v11.8h}, [x7] + st1 {v12.8h}, [sp] mov w8, #1 lsl w8, w8, w5 @@ -1059,8 +1054,8 @@ dct2_h16_2nd_loopx: cmp x8, x3 blt dct2_h16_2nd_loopx - ld1 {v10.8h, v11.8h}, [sp], #32 ld1 {v12.8h}, [sp], #16 + ld1 {v10.8h, v11.8h}, [sp], #32 ld1 {v8.8h, v9.8h}, [sp], #32 dct2_h16_end: diff --git a/source/decore/arm64/itrans_dct8_dst7_arm64.S b/source/decore/arm64/itrans_dct8_dst7_arm64.S index 340865c..5ba39e5 100644 --- a/source/decore/arm64/itrans_dct8_dst7_arm64.S +++ b/source/decore/arm64/itrans_dct8_dst7_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/arm64/pixel_arm64.S b/source/decore/arm64/pixel_arm64.S index bc79826..81fa81c 100644 --- a/source/decore/arm64/pixel_arm64.S +++ b/source/decore/arm64/pixel_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -285,8 +280,8 @@ padding_rows_lr_y: ld1r {v0.8h}, [x0] ld1r {v2.8h}, [x5] // right reference pointer - mov v1.8h, v0.8h - mov v3.8h, v2.8h + mov v1.16b, v0.16b + mov v3.16b, v2.16b sub x5, x0, x4 add x6, x0, x2 @@ -332,8 +327,8 @@ padding_rows_chroma_lr_y: ld1r {v0.4s}, [x0] // src[0] src[1] ld1r {v2.4s}, [x5] // right reference pointer - mov v1.4s, v0.4s - mov v3.4s, v2.4s + mov v1.16b, v0.16b + mov v3.16b, v2.16b sub x5, x0, x4 add x6, x0, x2 diff --git a/source/decore/arm64/sao_arm64.c b/source/decore/arm64/sao_arm64.c index dc1a5b0..37cf9e0 100644 --- a/source/decore/arm64/sao_arm64.c +++ b/source/decore/arm64/sao_arm64.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/arm64/sao_kernel_arm64.S b/source/decore/arm64/sao_kernel_arm64.S index 1546b17..b0d0bb8 100644 --- a/source/decore/arm64/sao_kernel_arm64.S +++ b/source/decore/arm64/sao_kernel_arm64.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -451,10 +446,10 @@ loop_x_eo_90_chroma_end: ************************************************************************************************************************************/ function uavs3d_sao_eo_135_arm64 // get start_x_r0 and end_x_r0 -#if defined(__ANDROID__) - ldp x8, x9, [sp] -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w8, w9, [sp] +#else + ldp x8, x9, [sp] #endif sxtw x8, w8 // start_x_r0 sxtw x9, w9 // end_x_r0 @@ -537,10 +532,10 @@ test_loop_x_eo_135_end_r0: // ------- middle rows ------- // get param -#if defined(__ANDROID__) - ldp x7, x8, [sp, #16] -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w7, w8, [sp, #8] +#else + ldp x7, x8, [sp, #16] #endif sxtw x7, w7 // start_x_r sxtw x8, w8 // end_x_r @@ -613,10 +608,10 @@ test_loop_x_eo_135_end_r: bgt test_loop_y_eo_135_r // ------- last row ------- -#if defined(__ANDROID__) - ldp x6, x7, [sp, #32] -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w6, w7, [sp, #16] +#else + ldp x6, x7, [sp, #32] #endif sxtw x6, w6 // start_x_rn sxtw x7, w7 // end_x_rn @@ -691,10 +686,10 @@ test_loop_x_eo_135_end_rn: ************************************************************************************************************************************/ function uavs3d_sao_eo_135_chroma_arm64 -#if defined(__ANDROID__) - ldp x8, x9, [sp] -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w8, w9, [sp] +#else + ldp x8, x9, [sp] #endif sxtw x8, w8 // start_x_r0 sxtw x9, w9 // end_x_r0 @@ -793,10 +788,10 @@ loop_x_eo_135_chroma_end_r0: add x1, x1, x3 //-- dst+=dst_stride //--------------------------------middle rows-------------------------------- -#if defined(__ANDROID__) - ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w7 , w8, [sp, #8] //-- x7=start_x_r; x8=end_x_r +#else + ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r #endif sxtw x7 , w7 sxtw x8 , w8 @@ -886,10 +881,10 @@ loop_x_eo_135_chroma_end_r: bgt loop_y_eo_135_chroma_r 
//---------------------------------last row-------------------------------- -#if defined(__ANDROID__) - ldp x6, x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w6, w7, [sp, #16] //-- x6=start_x_rn; x7=end_x_rn +#else + ldp x6, x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn #endif sxtw x7, w7 sxtw x6, w6 @@ -979,10 +974,10 @@ loop_x_eo_135_chroma_end_rn: * end_x_r0->x8, start_x_r->x9, end_x_r->x10, start_x_rn->x11, end_x_rn->x12 ************************************************************************************************************************************/ function uavs3d_sao_eo_45_arm64 -#if defined(__ANDROID__) - ldp x8, x9, [sp] -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w8, w9, [sp] +#else + ldp x8, x9, [sp] #endif sxtw x8, w8 // start_x_r0 sxtw x9, w9 // end_x_r0 @@ -1064,10 +1059,10 @@ test_loop_x_eo_45_end_r0: // ------- middle rows ------- // get param -#if defined(__ANDROID__) - ldp x7, x8, [sp, #16] // x7 start_x_r; x8 end_x_r -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w7, w8, [sp, #8] // x7 start_x_r; x8 end_x_r +#else + ldp x7, x8, [sp, #16] // x7 start_x_r; x8 end_x_r #endif sxtw x7, w7 sxtw x8, w8 @@ -1141,10 +1136,10 @@ test_loop_x_eo_45_end_r: bgt test_loop_y_eo_45_r // ------- last row ------- -#if defined(__ANDROID__) - ldp x6, x7, [sp, #32] // $x6 start_x_rn; $x7 end_x_rn -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w6, w7, [sp, #16] // $x6 start_x_rn; $x7 end_x_rn +#else + ldp x6, x7, [sp, #32] // $x6 start_x_rn; $x7 end_x_rn #endif sxtw x6, w6 sxtw x7, w7 @@ -1217,10 +1212,10 @@ ret * end_x_r0->x8, start_x_r->x9, end_x_r->x10, start_x_rn->x11, end_x_rn->x12 ************************************************************************************************************************************/ function uavs3d_sao_eo_45_chroma_arm64 -#if defined(__ANDROID__) - ldp x8, x9, [sp] -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w8, w9, [sp] +#else + ldp x8, x9, 
[sp] #endif sxtw x8, w8 // start_x_r0 sxtw x9, w9 // end_x_r0 @@ -1315,10 +1310,10 @@ loop_x_eo_45_chroma_end_r0: add x1, x1, x3 //-- dst+=dst_stride //--------------------------------middle rows-------------------------------- -#if defined(__ANDROID__) - ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w7 , w8, [sp, #8] //-- x7=start_x_r; x8=end_x_r +#else + ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r #endif sxtw x7 , w7 sxtw x8 , w8 @@ -1405,10 +1400,10 @@ loop_x_eo_45_chroma_end_r: bgt loop_y_eo_45_chroma_r //---------------------------------last row-------------------------------- -#if defined(__ANDROID__) - ldp x6 , x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w6 , w7, [sp, #16] //-- x6=start_x_rn; x7=end_x_rn +#else + ldp x6 , x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn #endif sxtw x7 , w7 sxtw x6 , w6 @@ -2221,10 +2216,10 @@ loop_x_eo_90_chroma_end: * src->x0, dst->x1, src_stride->x2, dst_stride->x3, offset->x4, mask->x5, mb_height->x6, bit_depth->x7 ************************************************************************************************************************************/ function uavs3d_sao_eo_135_arm64 -#if defined(__ANDROID__) - ldp x8, x9, [sp] // start_x_r0 and end_x_r0 -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w8, w9, [sp] +#else + ldp x8, x9, [sp] // start_x_r0 and end_x_r0 #endif sxtw x8, w8 // start_x_r0 @@ -2336,10 +2331,10 @@ test_loop_x_eo_135_end_r0: // ------- middle rows ------- // get param -#if defined(__ANDROID__) - ldp x7, x8, [sp, #16] -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w7, w8, [sp, #8] +#else + ldp x7, x8, [sp, #16] #endif sxtw x7, w7 // x7 start_x_r sxtw x8, w8 // x8 end_x_r @@ -2431,10 +2426,10 @@ test_loop_x_eo_135_end_r: bgt test_loop_y_eo_135_r // ------- last row ------- -#if defined(__ANDROID__) - ldp x6, x7, [sp, #32] -#elif defined(__APPLE__) +#if defined(__APPLE__) 
ldp w6, w7, [sp, #16] +#else + ldp x6, x7, [sp, #32] #endif sxtw x6, w6 // start_x_rn sxtw x7, w7 // end_x_rn @@ -2527,10 +2522,10 @@ test_loop_x_eo_135_end_rn: * src->x0, dst->x1, src_stride->x2, dst_stride->x3, offset->x4, mask->x5, mb_height->x6, bit_depth->x7 ************************************************************************************************************************************/ function uavs3d_sao_eo_135_chroma_arm64 -#if defined(__ANDROID__) - ldp x8, x9, [sp] // start_x_r0 and end_x_r0 -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w8, w9, [sp] +#else + ldp x8, x9, [sp] // start_x_r0 and end_x_r0 #endif mov w13, #1 @@ -2636,10 +2631,10 @@ loop_x_eo_135_chroma_end_r0: add x1, x1, x3 //-- dst+=dst_stride //--------------------------------middle rows-------------------------------- -#if defined(__ANDROID__) - ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w7 , w8, [sp, #8] //-- x7=start_x_r; x8=end_x_r +#else + ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r #endif sxtw x7 , w7 sxtw x8 , w8 @@ -2730,10 +2725,10 @@ loop_x_eo_135_chroma_end_r: bgt loop_y_eo_135_chroma_r //---------------------------------last row-------------------------------- -#if defined(__ANDROID__) - ldp x6, x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w6, w7, [sp, #16] //-- x6=start_x_rn; x7=end_x_rn +#else + ldp x6, x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn #endif sxtw x7, w7 sxtw x6, w6 @@ -2822,10 +2817,10 @@ loop_x_eo_135_chroma_end_rn: * src->x0, dst->x1, src_stride->x2, dst_stride->x3, offset->x4, mask->x5, mb_height->x6, bit_depth->x7 ************************************************************************************************************************************/ function uavs3d_sao_eo_45_arm64 -#if defined(__ANDROID__) - ldp x8, x9, [sp] // start_x_r0 and end_x_r0 -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w8, w9, [sp] +#else + 
ldp x8, x9, [sp] // start_x_r0 and end_x_r0 #endif mov w12, #1 @@ -2936,10 +2931,10 @@ test_loop_x_eo_45_end_r0: // ------- middle rows ------- // get param -#if defined(__ANDROID__) - ldp x7, x8, [sp, #16] // x7 start_x_r; x8 end_x_r -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w7, w8, [sp, #8] // x7 start_x_r; x8 end_x_r +#else + ldp x7, x8, [sp, #16] // x7 start_x_r; x8 end_x_r #endif sxtw x7, w7 sxtw x8, w8 @@ -3031,10 +3026,10 @@ test_loop_x_eo_45_end_r: bgt test_loop_y_eo_45_r // ------- last row ------- -#if defined(__ANDROID__) - ldp x6, x7, [sp, #32] -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w6, w7, [sp, #16] +#else + ldp x6, x7, [sp, #32] #endif sxtw x6, w6 // start_x_rn sxtw x7, w7 // end_x_rn @@ -3126,10 +3121,10 @@ test_loop_x_eo_45_end_rn: * src->x0, dst->x1, src_stride->x2, dst_stride->x3, offset->x4, mask->x5, mb_height->x6, bit_depth->x7 ************************************************************************************************************************************/ function uavs3d_sao_eo_45_chroma_arm64 -#if defined(__ANDROID__) - ldp x8, x9, [sp] // start_x_r0 and end_x_r0 -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w8, w9, [sp] +#else + ldp x8, x9, [sp] // start_x_r0 and end_x_r0 #endif mov w12, #1 @@ -3236,10 +3231,10 @@ loop_x_eo_45_chroma_end_r0: add x1, x1, x3 //-- dst+=dst_stride //--------------------------------middle rows-------------------------------- -#if defined(__ANDROID__) - ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r -#elif defined(__APPLE__) +#if defined(__APPLE__) ldp w7 , w8, [sp, #8] //-- x7=start_x_r; x8=end_x_r +#else + ldp x7 , x8, [sp, #16] //-- x7=start_x_r; x8=end_x_r #endif sxtw x7 , w7 sxtw x8 , w8 @@ -3328,10 +3323,10 @@ loop_x_eo_45_chroma_end_r: bgt loop_y_eo_45_chroma_r //---------------------------------last row-------------------------------- -#if defined(__ANDROID__) - ldp x6 , x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn -#elif defined(__APPLE__) +#if 
defined(__APPLE__) ldp w6 , w7, [sp, #16] //-- x6=start_x_rn; x7=end_x_rn +#else + ldp x6 , x7, [sp, #32] //-- x6=start_x_rn; x7=end_x_rn #endif sxtw x7 , w7 sxtw x6 , w6 diff --git a/source/decore/arm64/sao_kernel_arm64.h b/source/decore/arm64/sao_kernel_arm64.h index 03373ce..b9c448e 100644 --- a/source/decore/arm64/sao_kernel_arm64.h +++ b/source/decore/arm64/sao_kernel_arm64.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/alf_armv7.S b/source/decore/armv7/alf_armv7.S index 5b0e3e0..c87f29b 100644 --- a/source/decore/armv7/alf_armv7.S +++ b/source/decore/armv7/alf_armv7.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/armv7.c b/source/decore/armv7/armv7.c index 0be03a4..6ad9086 100644 --- a/source/decore/armv7/armv7.c +++ b/source/decore/armv7/armv7.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -242,7 +237,7 @@ void uavs3d_itrans_dct2_h64_w64_armv7(s16 *src, s16 *dst, int bit_depth) void uavs3d_funs_init_armv7() { -#if BIT_DEPTH == 8 +#if !COMPILE_10BIT uavs3d_funs_handle.sao[ Y_C] = uavs3d_sao_on_lcu_armv7; uavs3d_funs_handle.sao[UV_C] = uavs3d_sao_on_lcu_chroma_armv7; uavs3d_funs_handle.alf[ Y_C] = uavs3d_alf_one_lcu_armv7; diff --git a/source/decore/armv7/armv7.h b/source/decore/armv7/armv7.h index 60efd7e..0114927 100644 --- a/source/decore/armv7/armv7.h +++ b/source/decore/armv7/armv7.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/dct2_armv7.S b/source/decore/armv7/dct2_armv7.S index 82c2f82..05e965f 100644 --- a/source/decore/armv7/dct2_armv7.S +++ b/source/decore/armv7/dct2_armv7.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/deblock_armv7.S b/source/decore/armv7/deblock_armv7.S index 202d121..dcdeabf 100644 --- a/source/decore/armv7/deblock_armv7.S +++ b/source/decore/armv7/deblock_armv7.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/def_armv7.S b/source/decore/armv7/def_armv7.S index fdc3d27..861f016 100644 --- a/source/decore/armv7/def_armv7.S +++ b/source/decore/armv7/def_armv7.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -35,23 +30,18 @@ * For more information, contact us at rgwang@pkusz.edu.cn. 
**************************************************************************************/ -#if defined(__ANDROID__) -.macro function name - .global \name - .hidden \name - .type \name, %function -\name: -.endm -#elif defined(__APPLE__) +#if defined(__APPLE__) .macro function name .text .align 4 .global _\name _\name: .endm - +#else +.macro function name + .global \name + .hidden \name + .type \name, %function +\name: +.endm #endif - -#define COMPILE_10BIT 0 - - diff --git a/source/decore/armv7/inter_pred_armv7.S b/source/decore/armv7/inter_pred_armv7.S index edbcc62..fd78ae4 100644 --- a/source/decore/armv7/inter_pred_armv7.S +++ b/source/decore/armv7/inter_pred_armv7.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/intra_pred_armv7.S b/source/decore/armv7/intra_pred_armv7.S index cbce583..807f6a5 100644 --- a/source/decore/armv7/intra_pred_armv7.S +++ b/source/decore/armv7/intra_pred_armv7.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/itrans_dct8_dst7_armv7.S b/source/decore/armv7/itrans_dct8_dst7_armv7.S index 129bbc3..d8d1ab1 100755 --- a/source/decore/armv7/itrans_dct8_dst7_armv7.S +++ b/source/decore/armv7/itrans_dct8_dst7_armv7.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/pixel_armv7.S b/source/decore/armv7/pixel_armv7.S index 83124a4..f88f474 100644 --- a/source/decore/armv7/pixel_armv7.S +++ b/source/decore/armv7/pixel_armv7.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/sao_armv7.c b/source/decore/armv7/sao_armv7.c index cec6880..8b3070e 100644 --- a/source/decore/armv7/sao_armv7.c +++ b/source/decore/armv7/sao_armv7.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/sao_kernel_armv7.S b/source/decore/armv7/sao_kernel_armv7.S index 0449ecc..1652c99 100644 --- a/source/decore/armv7/sao_kernel_armv7.S +++ b/source/decore/armv7/sao_kernel_armv7.S @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/armv7/sao_kernel_armv7.h b/source/decore/armv7/sao_kernel_armv7.h index d30c690..49b020b 100644 --- a/source/decore/armv7/sao_kernel_armv7.h +++ b/source/decore/armv7/sao_kernel_armv7.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/avx2/alf_avx2.c b/source/decore/avx2/alf_avx2.c index 74226b6..cc2cf5f 100644 --- a/source/decore/avx2/alf_avx2.c +++ b/source/decore/avx2/alf_avx2.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -489,7 +484,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src __m256i T000, T001, T010, T011, T100, T101, T110, T111, T200, T201, T210, T211, T310, T311, T300, T301, T400, T401, T410, T411, T500, T501, T510, T511, T600, T601, T610, T611, T700, T701, T710, T711; __m256i E00, E01, E10, E11; __m256i C0, C1, C2, C3, C4, C5, C6, C7, C8; - __m256i S0, S00, S01, S1, S10, S11, S2, S20, S21, S3, S30, S31, S4, S40, S41, S5, S50, S51, S6, S7, S8, S60, S61, S70, S71, S80, S81, S82, S83, SS1, SS2, SS3, SS4; + __m256i S0, S00, S01, S1, S10, S11, S2, S20, S21, S3, S30, S31, S4, S40, S41, S5, S50, S51, S6, S7, S60, S61, S70, S71, S80, S81, S82, S83, SS1, SS2, SS3, SS4; __m256i mAddOffset; __m256i mZero = _mm256_set1_epi16(0); __m256i mMax = _mm256_set1_epi16((short)((1 << sample_bit_depth) - 1)); @@ -629,7 +624,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src S5 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S50, 1)); S6 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S60, 1)); S7 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S70, 1)); - S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S80, 1)); + //S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S80, 1)); S0 = _mm256_add_epi32(S0, S1); S2 = _mm256_add_epi32(S2, S3); S4 = _mm256_add_epi32(S4, S5); @@ -803,7 +798,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src S5 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S50, 1)); S6 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S60, 1)); S7 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S70, 1)); - S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S80, 1)); + //S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S80, 1)); S0 = _mm256_add_epi32(S0, S1); S2 = _mm256_add_epi32(S2, S3); S4 = _mm256_add_epi32(S4, S5); @@ -821,7 +816,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src S5 = 
_mm256_cvtepi16_epi32(_mm256_castsi256_si128(S51)); S6 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(S61)); S7 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(S71)); - S8 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(S81)); + //S8 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(S81)); S0 = _mm256_add_epi32(S0, S1); S2 = _mm256_add_epi32(S2, S3); S4 = _mm256_add_epi32(S4, S5); @@ -839,7 +834,7 @@ void uavs3d_alf_one_lcu_one_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src S5 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S51, 1)); S6 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S61, 1)); S7 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S71, 1)); - S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S81, 1)); + //S8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(S81, 1)); S0 = _mm256_add_epi32(S0, S1); S2 = _mm256_add_epi32(S2, S3); S4 = _mm256_add_epi32(S4, S5); @@ -1055,7 +1050,6 @@ void uavs3d_alf_one_lcu_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src, in int startPos = 0; int endPos = lcu_height; int xPosEnd = lcu_width << 1; - long long tmp[8]; src += (startPos*i_src); dst += (startPos*i_dst); @@ -1066,22 +1060,14 @@ void uavs3d_alf_one_lcu_chroma_avx2(pel *dst, int i_dst, pel *src, int i_src, in T00 = _mm256_unpacklo_epi16(C8, C8); T01 = _mm256_unpackhi_epi16(C8, C8); - tmp[0] = _mm256_extract_epi64(T00, 0); // win32 compile error if C0 = _mm256_set1_epi64x(_mm256_extract_epi64(T00, 0)); - tmp[1] = _mm256_extract_epi64(T00, 1); - tmp[2] = _mm256_extract_epi64(T00, 2); - tmp[3] = _mm256_extract_epi64(T00, 3); - tmp[4] = _mm256_extract_epi64(T01, 0); - tmp[5] = _mm256_extract_epi64(T01, 1); - tmp[6] = _mm256_extract_epi64(T01, 2); - tmp[7] = _mm256_extract_epi64(T01, 3); - C0 = _mm256_set1_epi64x(tmp[0]); - C1 = _mm256_set1_epi64x(tmp[1]); - C2 = _mm256_set1_epi64x(tmp[2]); - C3 = _mm256_set1_epi64x(tmp[3]); - C4 = _mm256_set1_epi64x(tmp[4]); - C5 = _mm256_set1_epi64x(tmp[5]); - C6 = _mm256_set1_epi64x(tmp[6]); - C7 = 
_mm256_set1_epi64x(tmp[7]); + C0 = _mm256_permute4x64_epi64(T00, 0x00); + C1 = _mm256_permute4x64_epi64(T00, 0x55); + C2 = _mm256_permute4x64_epi64(T00, 0xaa); + C3 = _mm256_permute4x64_epi64(T00, 0xff); + C4 = _mm256_permute4x64_epi64(T01, 0x00); + C5 = _mm256_permute4x64_epi64(T01, 0x55); + C6 = _mm256_permute4x64_epi64(T01, 0xaa); + C7 = _mm256_permute4x64_epi64(T01, 0xff); C8 = _mm256_set1_epi32((unsigned short)coef[16] + (((unsigned short)coef[17]) << 16)); C8 = _mm256_unpacklo_epi16(C8, C8); diff --git a/source/decore/avx2/avx2.c b/source/decore/avx2/avx2.c index a17dac5..d4b3ebf 100644 --- a/source/decore/avx2/avx2.c +++ b/source/decore/avx2/avx2.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -183,11 +178,13 @@ void uavs3d_funs_init_avx2() uavs3d_funs_handle.ipcpy[4] = uavs3d_if_cpy_w64_avx2; uavs3d_funs_handle.ipcpy[5] = uavs3d_if_cpy_w128_avx2; + uavs3d_funs_handle.ipflt[IPFILTER_H_4][1] = uavs3d_if_hor_chroma_w8_avx2; uavs3d_funs_handle.ipflt[IPFILTER_H_4][2] = uavs3d_if_hor_chroma_w16_avx2; uavs3d_funs_handle.ipflt[IPFILTER_H_4][3] = uavs3d_if_hor_chroma_w16x_avx2; uavs3d_funs_handle.ipflt[IPFILTER_H_4][4] = uavs3d_if_hor_chroma_w16x_avx2; uavs3d_funs_handle.ipflt[IPFILTER_H_4][5] = uavs3d_if_hor_chroma_w16x_avx2; + uavs3d_funs_handle.ipflt[IPFILTER_H_8][1] = uavs3d_if_hor_luma_w8_avx2; uavs3d_funs_handle.ipflt[IPFILTER_H_8][2] = uavs3d_if_hor_luma_w16_avx2; uavs3d_funs_handle.ipflt[IPFILTER_H_8][3] = uavs3d_if_hor_luma_w16x_avx2; uavs3d_funs_handle.ipflt[IPFILTER_H_8][4] = uavs3d_if_hor_luma_w16x_avx2; @@ -198,6 +195,7 @@ void uavs3d_funs_init_avx2() uavs3d_funs_handle.ipflt[IPFILTER_V_4][4] = uavs3d_if_ver_chroma_w32x_avx2; uavs3d_funs_handle.ipflt[IPFILTER_V_4][5] = uavs3d_if_ver_chroma_w32x_avx2; + uavs3d_funs_handle.ipflt[IPFILTER_V_8][1] = uavs3d_if_ver_luma_w8_avx2; uavs3d_funs_handle.ipflt[IPFILTER_V_8][2] = uavs3d_if_ver_luma_w16_avx2; uavs3d_funs_handle.ipflt[IPFILTER_V_8][3] = uavs3d_if_ver_luma_w16x_avx2; uavs3d_funs_handle.ipflt[IPFILTER_V_8][4] = uavs3d_if_ver_luma_w16x_avx2; @@ -209,6 +207,8 @@ void uavs3d_funs_init_avx2() uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_4][4] = uavs3d_if_hor_ver_chroma_w16x_avx2; uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_4][5] = uavs3d_if_hor_ver_chroma_w16x_avx2; + uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][0] = uavs3d_if_hor_ver_luma_w4_avx2; + uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][1] = uavs3d_if_hor_ver_luma_w8_avx2; uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][2] = uavs3d_if_hor_ver_luma_w16x_avx2; uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][3] = uavs3d_if_hor_ver_luma_w16x_avx2; uavs3d_funs_handle.ipflt_ext[IPFILTER_EXT_8][4] = uavs3d_if_hor_ver_luma_w16x_avx2; diff --git 
a/source/decore/avx2/avx2.h b/source/decore/avx2/avx2.h index 43d0ad3..a29cfe8 100644 --- a/source/decore/avx2/avx2.h +++ b/source/decore/avx2/avx2.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/avx2/inter_pred_avx2.c b/source/decore/avx2/inter_pred_avx2.c index c7287d3..cbaa2fc 100644 --- a/source/decore/avx2/inter_pred_avx2.c +++ b/source/decore/avx2/inter_pred_avx2.c @@ -1,5 +1,5 @@ /************************************************************************************** -* Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", +* Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. -* 3. All advertising materials mentioning features or use of this software -* must display the following acknowledgement: -* This product includes the software uAVS3d developed by -* Peking University Shenzhen Graduate School, Peng Cheng Laboratory -* and Guangdong Bohua UHD Innovation Corporation. -* 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, +* 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -33,7 +28,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * For more information, contact us at rgwang@pkusz.edu.cn. 
-**************************************************************************************/ +**************************************************************************************/ #include "avx2.h" @@ -134,9 +129,11 @@ void uavs3d_if_hor_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst __m256i mCoefy1_hor = _mm256_set1_epi16(*(s16*)coeff); __m256i mCoefy2_hor = _mm256_set1_epi16(*(s16*)(coeff + 2)); - __m256i mSwitch = _mm256_setr_epi8(0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9); + __m256i mSwitch0 = _mm256_setr_epi8(0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9); + __m256i mSwitch1 = _mm256_setr_epi8(0+4, 2+4, 1+4, 3+4, 2+4, 4+4, 3+4, 5+4, 4+4, 6+4, 5+4, 7+4, 6+4, 8+4, 7+4, 9+4, + 0+4, 2+4, 1+4, 3+4, 2+4, 4+4, 3+4, 5+4, 4+4, 6+4, 5+4, 7+4, 6+4, 8+4, 7+4, 9+4); __m256i mAddOffset = _mm256_set1_epi16(offset); - __m256i T0, T1, S0, S1, R0, R1, sum; + __m256i T0, T1, S0, R0, R1, sum; __m128i s0, s1; src -= 2; @@ -144,27 +141,27 @@ void uavs3d_if_hor_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst while (height > 0) { s0 = _mm_loadu_si128((__m128i*)(src)); s1 = _mm_loadu_si128((__m128i*)(src + i_src)); + src += i_src << 1; + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); S0 = _mm256_set_m128i(s1, s0); - S1 = _mm256_srli_si256(S0, 4); - R0 = _mm256_shuffle_epi8(S0, mSwitch); // 4 rows s0 and s1 - R1 = _mm256_shuffle_epi8(S1, mSwitch); + R0 = _mm256_shuffle_epi8(S0, mSwitch0); // 4 rows s0 and s1 + R1 = _mm256_shuffle_epi8(S0, mSwitch1); T0 = _mm256_maddubs_epi16(R0, mCoefy1_hor); // 4x4: s0*c0 + s1*c1 T1 = _mm256_maddubs_epi16(R1, mCoefy2_hor); sum = _mm256_add_epi16(T0, T1); - sum = _mm256_add_epi16(sum, mAddOffset); + sum = _mm256_add_epi16(sum, mAddOffset); sum = _mm256_srai_epi16(sum, shift); s0 = _mm_packus_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - s1 = _mm_srli_si128(s0, 8); 
_mm_storel_epi64((__m128i*)(dst), s0); - _mm_storel_epi64((__m128i*)(dst + i_dst), s1); + _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s0)); height -= 2; - src += i_src << 1; dst += i_dst << 1; } } @@ -184,9 +181,11 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds src -= 2; while (height) { - uavs3d_prefetch(src + i_src*2, _MM_HINT_NTA); S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); + src += i_src << 1; + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); S2 = _mm256_permute4x64_epi64(S0, 0x94); S3 = _mm256_permute4x64_epi64(S1, 0x94); R0 = _mm256_shuffle_epi8(S2, mSwitch1); @@ -200,6 +199,8 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds sum0 = _mm256_add_epi16(T0, T1); sum1 = _mm256_add_epi16(T2, T3); + height -= 2; + sum0 = _mm256_add_epi16(sum0, mAddOffset); sum1 = _mm256_add_epi16(sum1, mAddOffset); sum0 = _mm256_srai_epi16(sum0, shift); @@ -207,8 +208,6 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds _mm_storeu_si128((__m128i*)(dst), _mm_packus_epi16(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1))); _mm_storeu_si128((__m128i*)(dst + i_dst), _mm_packus_epi16(_mm256_castsi256_si128(sum1), _mm256_extracti128_si256(sum1, 1))); - height -= 2; - src += i_src << 1; dst += i_dst << 1; } } @@ -228,10 +227,10 @@ void uavs3d_if_hor_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds src -= 2; while (height--) { - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); - S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + 16)); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); + S2 = _mm256_permute4x64_epi64(S0, 0x94); S3 = _mm256_permute4x64_epi64(S1, 0x94); R0 = _mm256_shuffle_epi8(S2, mSwitch1); @@ -308,59 +307,52 @@ void uavs3d_if_hor_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_dst, const int offset = 32; const int shift = 
6; __m256i mAddOffset = _mm256_set1_epi16(offset); - __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12, 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12); - __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14, 2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14); - __m256i T0, T1, T2, T3, S0, S1, S2, S3, sum; + __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); + __m256i T0, T1, T2, T3, S0, S1, sum; __m256i r0, r1, r2, r3; __m128i s0, s1, s2, s3; - __m256i mCoefy1_hor = _mm256_set1_epi16(*(s16*)coeff); - __m256i mCoefy2_hor = _mm256_set1_epi16(*(s16*)(coeff + 2)); - __m256i mCoefy3_hor = _mm256_set1_epi16(*(s16*)(coeff + 4)); - __m256i mCoefy4_hor = _mm256_set1_epi16(*(s16*)(coeff + 6)); + __m256i mCoefy1_hor = _mm256_set1_epi32(*(s32*)coeff); + __m256i mCoefy2_hor = _mm256_set1_epi32(*(s32*)(coeff + 4)); src -= 3; while (height > 0) { - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); s0 = _mm_loadu_si128((__m128i*)(src)); s1 = _mm_loadu_si128((__m128i*)(src + i_src)); s2 = _mm_loadu_si128((__m128i*)(src + i_src * 2)); s3 = _mm_loadu_si128((__m128i*)(src + i_src * 3)); + src += i_src << 2; + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); S0 = _mm256_set_m128i(s2, s0); S1 = _mm256_set_m128i(s3, s1); - S2 = _mm256_srli_si256(S0, 4); - S3 = _mm256_srli_si256(S1, 4); - - T0 = _mm256_unpacklo_epi64(S0, S1); - T1 = _mm256_unpacklo_epi64(S2, S3); - - r0 = _mm256_shuffle_epi8(T0, mSwitch1); - r1 = _mm256_shuffle_epi8(T0, mSwitch2); - r2 = _mm256_shuffle_epi8(T1, mSwitch1); - r3 = _mm256_shuffle_epi8(T1, mSwitch2); + r0 = _mm256_shuffle_epi8(S0, mSwitch1); + r1 = _mm256_shuffle_epi8(S0, mSwitch2); + r2 = _mm256_shuffle_epi8(S1, mSwitch1); + r3 = 
_mm256_shuffle_epi8(S1, mSwitch2); T0 = _mm256_maddubs_epi16(r0, mCoefy1_hor); T1 = _mm256_maddubs_epi16(r1, mCoefy2_hor); - T2 = _mm256_maddubs_epi16(r2, mCoefy3_hor); - T3 = _mm256_maddubs_epi16(r3, mCoefy4_hor); + T2 = _mm256_maddubs_epi16(r2, mCoefy1_hor); + T3 = _mm256_maddubs_epi16(r3, mCoefy2_hor); T0 = _mm256_add_epi16(T0, T1); T1 = _mm256_add_epi16(T2, T3); - sum = _mm256_add_epi16(T0, T1); + sum = _mm256_hadd_epi16(T0, T1); sum = _mm256_add_epi16(sum, mAddOffset); sum = _mm256_srai_epi16(sum, shift); s0 = _mm_packus_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + height -= 4; M32(dst) = _mm_extract_epi32(s0, 0); M32(dst + i_dst) = _mm_extract_epi32(s0, 1); M32(dst + i_dst * 2) = _mm_extract_epi32(s0, 2); M32(dst + i_dst * 3) = _mm_extract_epi32(s0, 3); - height -= 4; - src += i_src << 2; dst += i_dst << 2; } } @@ -385,9 +377,11 @@ void uavs3d_if_hor_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, src -= 3; while (height) { - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); s0 = _mm_loadu_si128((__m128i*)(src)); s1 = _mm_loadu_si128((__m128i*)(src + i_src)); + src += i_src << 1; + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); S = _mm256_set_m128i(s1, s0); r0 = _mm256_shuffle_epi8(S, mSwitch1); @@ -407,13 +401,11 @@ void uavs3d_if_hor_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, sum = _mm256_add_epi16(sum, mAddOffset); sum = _mm256_srai_epi16(sum, shift); + height -= 2; s0 = _mm_packus_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - s1 = _mm_srli_si128(s0, 8); _mm_storel_epi64((__m128i*)(dst), s0); - _mm_storel_epi64((__m128i*)(dst + i_dst), s1); + _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s0)); - height -= 2; - src += i_src << 1; dst += i_dst << 1; } } @@ -437,11 +429,13 @@ void uavs3d_if_hor_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, src -= 3; while (height) { - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); S0 = 
_mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); S2 = _mm256_permute4x64_epi64(S0, 0x94); S3 = _mm256_permute4x64_epi64(S1, 0x94); + src += i_src << 1; + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); r0 = _mm256_shuffle_epi8(S2, mSwitch1); r1 = _mm256_shuffle_epi8(S2, mSwitch2); @@ -475,11 +469,10 @@ void uavs3d_if_hor_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, sum0 = _mm256_srai_epi16(sum0, shift); sum1 = _mm256_srai_epi16(sum1, shift); + height -= 2; _mm_storeu_si128((__m128i*)(dst), _mm_packus_epi16(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1))); _mm_storeu_si128((__m128i*)(dst + i_dst), _mm_packus_epi16(_mm256_castsi256_si128(sum1), _mm256_extracti128_si256(sum1, 1))); - height -= 2; - src += i_src << 1; dst += i_dst << 1; } } @@ -503,13 +496,14 @@ void uavs3d_if_hor_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst, src -= 3; while (height--) { - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); - S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + 16)); S2 = _mm256_permute4x64_epi64(S0, 0x94); S3 = _mm256_permute4x64_epi64(S1, 0x94); + src += i_src; + uavs3d_prefetch(src, _MM_HINT_NTA); + r0 = _mm256_shuffle_epi8(S2, mSwitch1); r1 = _mm256_shuffle_epi8(S2, mSwitch2); r2 = _mm256_shuffle_epi8(S2, mSwitch3); @@ -545,7 +539,6 @@ void uavs3d_if_hor_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst, _mm_storeu_si128((__m128i*)(dst), _mm_packus_epi16(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1))); _mm_storeu_si128((__m128i*)(dst + 16), _mm_packus_epi16(_mm256_castsi256_si128(sum1), _mm256_extracti128_si256(sum1, 1))); - src += i_src; dst += i_dst; } } @@ -635,15 +628,17 @@ void uavs3d_if_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst src -= i_src; while (height) { - uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA); - uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA); - height 
-= 2; s0 = _mm_loadl_epi64((__m128i*)(src)); s1 = _mm_loadl_epi64((__m128i*)(src + i_src)); s2 = _mm_loadl_epi64((__m128i*)(src + i_src2)); s3 = _mm_loadl_epi64((__m128i*)(src + i_src3)); s4 = _mm_loadl_epi64((__m128i*)(src + i_src4)); + src += 2 * i_src; + height -= 2; + uavs3d_prefetch(src + i_src3, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src4, _MM_HINT_NTA); + S0 = _mm256_set_m128i(s1, s0); S1 = _mm256_set_m128i(s2, s1); S2 = _mm256_set_m128i(s3, s2); @@ -660,12 +655,10 @@ void uavs3d_if_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst mVal = _mm256_add_epi16(mVal, mAddOffset); mVal = _mm256_srai_epi16(mVal, shift); s0 = _mm_packus_epi16(_mm256_castsi256_si128(mVal), _mm256_extracti128_si256(mVal, 1)); - s1 = _mm_srli_si128(s0, 8); _mm_storel_epi64((__m128i*)(dst), s0); - _mm_storel_epi64((__m128i*)(dst + i_dst), s1); + _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s0)); - src += 2 * i_src; dst += 2 * i_dst; } } @@ -687,15 +680,17 @@ void uavs3d_if_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds src -= i_src; while (height) { - uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA); - uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA); - height -= 2; s0 = _mm_loadu_si128((__m128i*)(src)); s1 = _mm_loadu_si128((__m128i*)(src + i_src)); s2 = _mm_loadu_si128((__m128i*)(src + i_src2)); s3 = _mm_loadu_si128((__m128i*)(src + i_src3)); s4 = _mm_loadu_si128((__m128i*)(src + i_src4)); + src += 2 * i_src; + uavs3d_prefetch(src + i_src3, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src4, _MM_HINT_NTA); + height -= 2; + S0 = _mm256_set_m128i(s1, s0); S1 = _mm256_set_m128i(s2, s1); S2 = _mm256_set_m128i(s3, s2); @@ -723,7 +718,6 @@ void uavs3d_if_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(mVal0)); _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(mVal0, 1)); - src += 2 * i_src; dst += 2 * i_dst; } } @@ -744,15 +738,17 @@ void 
uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds src -= i_src; while (height) { - uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA); - uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA); - height -= 2; S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); + src += 2 * i_src; + height -= 2; + uavs3d_prefetch(src + i_src3, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src4, _MM_HINT_NTA); + T0 = _mm256_unpacklo_epi8(S0, S1); T1 = _mm256_unpackhi_epi8(S0, S1); T2 = _mm256_unpacklo_epi8(S2, S3); @@ -790,7 +786,6 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds _mm256_storeu_si256((__m256i*)dst, mVal0); _mm256_storeu_si256((__m256i*)(dst + i_dst), mVal2); - src += 2 * i_src; dst += 2 * i_dst; } @@ -811,7 +806,6 @@ void uavs3d_if_ver_chroma_w64_avx2(const pel *src, int i_src, pel *dst, int i_ds src -= i_src; while (height--){ - uavs3d_prefetch(src + 4 * i_src, _MM_HINT_NTA); S0 = _mm256_loadu_si256((__m256i*)(src)); S4 = _mm256_loadu_si256((__m256i*)(src + 32)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); @@ -821,6 +815,7 @@ void uavs3d_if_ver_chroma_w64_avx2(const pel *src, int i_src, pel *dst, int i_ds S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); S7 = _mm256_loadu_si256((__m256i*)(src + i_src3 + 32)); + src += i_src; T0 = _mm256_unpacklo_epi8(S0, S1); T1 = _mm256_unpacklo_epi8(S2, S3); T2 = _mm256_unpackhi_epi8(S0, S1); @@ -830,6 +825,8 @@ void uavs3d_if_ver_chroma_w64_avx2(const pel *src, int i_src, pel *dst, int i_ds T6 = _mm256_unpackhi_epi8(S4, S5); T7 = _mm256_unpackhi_epi8(S6, S7); + uavs3d_prefetch(src + i_src3, _MM_HINT_NTA); + T0 = _mm256_maddubs_epi16(T0, coeff0); T1 = _mm256_maddubs_epi16(T1, coeff1); T2 = _mm256_maddubs_epi16(T2, coeff0); @@ -858,7 +855,6 @@ void uavs3d_if_ver_chroma_w64_avx2(const pel *src, 
int i_src, pel *dst, int i_ds _mm256_storeu_si256((__m256i*)(dst), mVal0); _mm256_storeu_si256((__m256i*)(dst + 32), mVal1); - src += i_src; dst += i_dst; } } @@ -878,7 +874,6 @@ void uavs3d_if_ver_chroma_w128_avx2(const pel *src, int i_src, pel *dst, int i_d src -= i_src; while (height--) { - uavs3d_prefetch(src + 4 * i_src, _MM_HINT_NTA); S0 = _mm256_loadu_si256((__m256i*)(src)); S4 = _mm256_loadu_si256((__m256i*)(src + 32)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); @@ -934,6 +929,9 @@ void uavs3d_if_ver_chroma_w128_avx2(const pel *src, int i_src, pel *dst, int i_d S3 = _mm256_loadu_si256((__m256i*)(src + i_src3 + 64)); S7 = _mm256_loadu_si256((__m256i*)(src + i_src3 + 96)); + src += i_src; + uavs3d_prefetch(src + i_src3, _MM_HINT_NTA); + T0 = _mm256_unpacklo_epi8(S0, S1); T1 = _mm256_unpacklo_epi8(S2, S3); T2 = _mm256_unpackhi_epi8(S0, S1); @@ -971,7 +969,6 @@ void uavs3d_if_ver_chroma_w128_avx2(const pel *src, int i_src, pel *dst, int i_d _mm256_storeu_si256((__m256i*)(dst + 64), mVal0); _mm256_storeu_si256((__m256i*)(dst + 96), mVal1); - src += i_src; dst += i_dst; } } @@ -1078,7 +1075,6 @@ void uavs3d_if_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, while (height) { __m128i S0, S1, S2, S3, S4, S5, S6, S7, S8; - height -= 2; S0 = _mm_loadl_epi64((__m128i*)(src)); S1 = _mm_loadl_epi64((__m128i*)(src + i_src)); S2 = _mm_loadl_epi64((__m128i*)(src + i_src2)); @@ -1098,6 +1094,11 @@ void uavs3d_if_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, R6 = _mm256_set_m128i(S7, S6); R7 = _mm256_set_m128i(S8, S7); + src += 2 * i_src; + height -= 2; + uavs3d_prefetch(src + i_src7, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src8, _MM_HINT_NTA); + T0 = _mm256_unpacklo_epi8(R0, R1); T1 = _mm256_unpacklo_epi8(R2, R3); T2 = _mm256_unpacklo_epi8(R4, R5); @@ -1115,11 +1116,9 @@ void uavs3d_if_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, mVal = _mm256_add_epi16(mVal, mAddOffset); mVal = _mm256_srai_epi16(mVal, shift); 
S0 = _mm_packus_epi16(_mm256_castsi256_si128(mVal), _mm256_extracti128_si256(mVal, 1)); - S1 = _mm_srli_si128(S0, 8); _mm_storel_epi64((__m128i*)(dst), S0); - _mm_storel_epi64((__m128i*)(dst + i_dst), S1); - src += 2 * i_src; + _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(S0)); dst += 2 * i_dst; } } @@ -1147,10 +1146,6 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, while(height) { __m128i S0, S1, S2, S3, S4, S5, S6, S7, S8; - uavs3d_prefetch(src + 9 * i_src, _MM_HINT_NTA); - uavs3d_prefetch(src + 10 * i_src, _MM_HINT_NTA); - - height -= 2; S0 = _mm_loadu_si128((__m128i*)(src)); S1 = _mm_loadu_si128((__m128i*)(src + i_src)); S2 = _mm_loadu_si128((__m128i*)(src + i_src2)); @@ -1170,6 +1165,12 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, R6 = _mm256_set_m128i(S6, S7); R7 = _mm256_set_m128i(S7, S8); + src += 2 * i_src; + height -= 2; + + uavs3d_prefetch(src + i_src7, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src8, _MM_HINT_NTA); + T0 = _mm256_unpacklo_epi8(R0, R1); T1 = _mm256_unpackhi_epi8(R0, R1); T2 = _mm256_unpacklo_epi8(R2, R3); @@ -1203,7 +1204,6 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, _mm_storeu_si128((__m128i*)dst, _mm256_extractf128_si256(mVal1, 1)); _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_castsi256_si128(mVal1)); - src += 2 * i_src; dst += 2 * i_dst; } } @@ -1230,7 +1230,6 @@ void uavs3d_if_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst, src -= 3 * i_src; while (height--) { __m256i S0, S1, S2, S3, S4, S5, S6, S7; - uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA); S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); @@ -1240,6 +1239,7 @@ void uavs3d_if_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst, S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); + 
src += i_src; T0 = _mm256_unpacklo_epi8(S0, S1); T1 = _mm256_unpacklo_epi8(S2, S3); T2 = _mm256_unpacklo_epi8(S4, S5); @@ -1249,6 +1249,8 @@ void uavs3d_if_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst, T6 = _mm256_unpackhi_epi8(S4, S5); T7 = _mm256_unpackhi_epi8(S6, S7); + uavs3d_prefetch(src + i_src7, _MM_HINT_NTA); + T0 = _mm256_maddubs_epi16(T0, coeff0); T1 = _mm256_maddubs_epi16(T1, coeff1); T2 = _mm256_maddubs_epi16(T2, coeff2); @@ -1273,7 +1275,6 @@ void uavs3d_if_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_dst, _mm256_storeu_si256((__m256i*)(dst), mVal1); - src += i_src; dst += i_dst; } } @@ -1295,12 +1296,11 @@ void uavs3d_if_ver_luma_w64_avx2(const pel *src, int i_src, pel *dst, int i_dst, __m256i coeff3 = _mm256_set1_epi16(*(s16*)(coeff + 6)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; - src -= 3 * i_src; + src -= i_src3; while (height--) { const pel *p = src + 32; __m256i S0, S1, S2, S3, S4, S5, S6, S7; - uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA); S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); @@ -1352,6 +1352,7 @@ void uavs3d_if_ver_luma_w64_avx2(const pel *src, int i_src, pel *dst, int i_dst, S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); + src += i_src; T0 = _mm256_unpacklo_epi8(S0, S1); T1 = _mm256_unpacklo_epi8(S2, S3); T2 = _mm256_unpacklo_epi8(S4, S5); @@ -1361,6 +1362,8 @@ void uavs3d_if_ver_luma_w64_avx2(const pel *src, int i_src, pel *dst, int i_dst, T6 = _mm256_unpackhi_epi8(S4, S5); T7 = _mm256_unpackhi_epi8(S6, S7); + uavs3d_prefetch(src + i_src7, _MM_HINT_NTA); + T0 = _mm256_maddubs_epi16(T0, coeff0); T1 = _mm256_maddubs_epi16(T1, coeff1); T2 = _mm256_maddubs_epi16(T2, coeff2); @@ -1385,7 +1388,6 @@ void uavs3d_if_ver_luma_w64_avx2(const pel *src, int i_src, pel *dst, int i_dst, _mm256_storeu_si256((__m256i*)(dst + 32), mVal1); - src += i_src; 
dst += i_dst; } } @@ -1412,7 +1414,6 @@ void uavs3d_if_ver_luma_w128_avx2(const pel *src, int i_src, pel *dst, int i_dst while (height--) { const pel *p = src + 32; __m256i S0, S1, S2, S3, S4, S5, S6, S7; - uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA); S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); @@ -1552,6 +1553,8 @@ void uavs3d_if_ver_luma_w128_avx2(const pel *src, int i_src, pel *dst, int i_dst S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); + src += i_src; + uavs3d_prefetch(src + i_src7, _MM_HINT_NTA); T0 = _mm256_unpacklo_epi8(S0, S1); T1 = _mm256_unpacklo_epi8(S2, S3); T2 = _mm256_unpacklo_epi8(S4, S5); @@ -1585,7 +1588,6 @@ void uavs3d_if_ver_luma_w128_avx2(const pel *src, int i_src, pel *dst, int i_dst _mm256_storeu_si256((__m256i*)(dst + 96), mVal1); - src += i_src; dst += i_dst; } } @@ -1668,6 +1670,8 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i S2 = _mm256_permute2x128_si256(mVal[1], mVal[2], 0x21); S3 = mVal[2]; + uavs3d_prefetch(src + i_src3, _MM_HINT_NTA); + T0 = _mm256_unpacklo_epi16(S0, S1); T1 = _mm256_unpacklo_epi16(S2, S3); T2 = _mm256_unpackhi_epi16(S0, S1); @@ -1709,17 +1713,15 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i T0 = _mm256_srai_epi32(T0, shift); T2 = _mm256_srai_epi32(T2, shift); - s0 = _mm_packus_epi16(_mm256_castsi256_si128(R0), _mm256_extracti128_si256(R0, 1)); - s1 = _mm_srli_si128(s0, 8); + s2 = _mm_packus_epi16(_mm256_castsi256_si128(R0), _mm256_extracti128_si256(R0, 1)); T0 = _mm256_packs_epi32(T0, T2); - s2 = _mm_packus_epi16(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1)); - s3 = _mm_srli_si128(s2, 8); + s3 = _mm_packus_epi16(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1)); - _mm_storel_epi64((__m128i*)(dst), s0); - _mm_storel_epi64((__m128i*)(dst + i_dst), s1); - 
_mm_storel_epi64((__m128i*)(dst + i_dst*2), s2); - _mm_storel_epi64((__m128i*)(dst + i_dst*3), s3); + _mm_storel_epi64((__m128i*)(dst), s2); + _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s2)); + _mm_storel_epi64((__m128i*)(dst + i_dst*2), s3); + _mm_storeh_pi((__m64*)(dst + i_dst*3), _mm_castsi128_ps(s3)); dst += i_dst << 2; height -= 4; @@ -1756,17 +1758,17 @@ void uavs3d_if_hor_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int row = height + 3; while (row--) { - uavs3d_prefetch(src + i_src*2, _MM_HINT_NTA); S0 = _mm256_loadu_si256((__m256i*)(src)); + src += i_src; S1 = _mm256_permute4x64_epi64(S0, 0x94); + uavs3d_prefetch(src, _MM_HINT_NTA); R0 = _mm256_shuffle_epi8(S1, mSwitch1); R1 = _mm256_shuffle_epi8(S1, mSwitch2); T0 = _mm256_maddubs_epi16(R0, mCoefy1_hor); T1 = _mm256_maddubs_epi16(R1, mCoefy2_hor); sum = _mm256_add_epi16(T0, T1); - _mm256_storeu_si256((__m256i*)(tmp), sum); - src += i_src; + _mm256_store_si256((__m256i*)(tmp), sum); tmp += i_tmp; } @@ -1878,8 +1880,8 @@ void uavs3d_if_hor_ver_chroma_w32x_avx2(const pel *src, int i_src, pel *dst, int sum0 = _mm256_add_epi16(T0, T1); sum1 = _mm256_add_epi16(T2, T3); - _mm256_storeu_si256((__m256i*)(tmp + col), sum0); - _mm256_storeu_si256((__m256i*)(tmp + col + 16), sum1); + _mm256_store_si256((__m256i*)(tmp + col), sum0); + _mm256_store_si256((__m256i*)(tmp + col + 16), sum1); } src += i_src; tmp += i_tmp; @@ -2055,8 +2057,8 @@ void uavs3d_if_hor_ver_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_d s0 = _mm_loadu_si128((__m128i*)(src)); s1 = _mm_loadu_si128((__m128i*)(src + i_src)); - s2 = _mm_loadu_si128((__m128i*)(src + i_src * 2)); - s3 = _mm_loadu_si128((__m128i*)(src + i_src * 3)); + s2 = _mm_loadu_si128((__m128i*)(src + i_src2)); + s3 = _mm_loadu_si128((__m128i*)(src + i_src3)); S0 = _mm256_set_m128i(s2, s0); S1 = _mm256_set_m128i(s3, s1); @@ -2090,8 +2092,8 @@ void uavs3d_if_hor_ver_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_d // hor s0 = 
_mm_loadu_si128((__m128i*)(src)); s1 = _mm_loadu_si128((__m128i*)(src + i_src)); - s2 = _mm_loadu_si128((__m128i*)(src + i_src * 2)); - s3 = _mm_loadu_si128((__m128i*)(src + i_src * 3)); + s2 = _mm_loadu_si128((__m128i*)(src + i_src2)); + s3 = _mm_loadu_si128((__m128i*)(src + i_src3)); S0 = _mm256_set_m128i(s2, s0); S1 = _mm256_set_m128i(s3, s1); @@ -2173,19 +2175,7 @@ void uavs3d_if_hor_ver_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_d void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val) { - const int i_tmp = 8; - const int i_tmp2 = 16; - const int i_tmp3 = 24; - const int i_tmp4 = 32; - const int i_tmp5 = 40; - const int i_tmp6 = 48; - const int i_tmp7 = 56;; - const int i_tmp8 = 64; - const int i_tmp9 = 72;; - const int i_tmp10 = 80; const int i_src2 = i_src << 1; - int row; - int shift = 12; __m256i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9; __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; @@ -2201,12 +2191,16 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d //HOR { + int row; src = src - 3 * i_src - 3; // first row { __m128i mSrc0 = _mm_loadu_si128((__m128i*)(src)); T0 = _mm256_set_m128i(mSrc0, mSrc0); + src += i_src; + + uavs3d_prefetch(src, _MM_HINT_NTA); r0 = _mm256_shuffle_epi8(T0, mSwitch1); r1 = _mm256_shuffle_epi8(T0, mSwitch2); @@ -2224,13 +2218,16 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d mVal[0] = _mm256_permute4x64_epi64(mVal[0], 0x44); - src += i_src; } for (row = 1; row < 4; row++) { - __m128i mSrc0 = _mm_loadu_si128((__m128i*)(src)); + __m128i mSrc0 = _mm_loadu_si128((__m128i*)(src)); __m128i mSrc1 = _mm_loadu_si128((__m128i*)(src + i_src)); T0 = _mm256_set_m128i(mSrc1, mSrc0); + src += i_src2; + + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); r0 = _mm256_shuffle_epi8(T0, mSwitch1); r1 = _mm256_shuffle_epi8(T0, mSwitch2); @@ 
-2245,8 +2242,6 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d T0 = _mm256_add_epi16(T0, T1); T1 = _mm256_add_epi16(T2, T3); mVal[row] = _mm256_add_epi16(T0, T1); - - src += i_src2; } } @@ -2260,35 +2255,44 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d __m256i mCoefy2 = _mm256_cvtepi8_epi16(mCoefy22); __m256i mCoefy3 = _mm256_cvtepi8_epi16(mCoefy33); __m256i mCoefy4 = _mm256_cvtepi8_epi16(mCoefy44); + const int shift = 12; while (height > 0) { __m128i s0, s1; //hor - s0 = _mm_loadu_si128((__m128i*)(src)); + s0 = _mm_loadu_si128((__m128i*)(src)); s1 = _mm_loadu_si128((__m128i*)(src + i_src)); T0 = _mm256_set_m128i(s1, s0); + src += i_src2; + + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); + r0 = _mm256_shuffle_epi8(T0, mSwitch1); r1 = _mm256_shuffle_epi8(T0, mSwitch2); r2 = _mm256_shuffle_epi8(T0, mSwitch3); r3 = _mm256_shuffle_epi8(T0, mSwitch4); - src += i_src2; - T0 = _mm256_maddubs_epi16(r0, mCoefy1_hor); T1 = _mm256_maddubs_epi16(r1, mCoefy2_hor); T2 = _mm256_maddubs_epi16(r2, mCoefy3_hor); T3 = _mm256_maddubs_epi16(r3, mCoefy4_hor); - s0 = _mm_loadu_si128((__m128i*)(src)); + s0 = _mm_loadu_si128((__m128i*)(src)); s1 = _mm_loadu_si128((__m128i*)(src + i_src)); T0 = _mm256_add_epi16(T0, T1); T1 = _mm256_add_epi16(T2, T3); mVal[4] = _mm256_add_epi16(T0, T1); + src += i_src2; + T0 = _mm256_set_m128i(s1, s0); + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); + r0 = _mm256_shuffle_epi8(T0, mSwitch1); r1 = _mm256_shuffle_epi8(T0, mSwitch2); r2 = _mm256_shuffle_epi8(T0, mSwitch3); @@ -2303,8 +2307,6 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d T1 = _mm256_add_epi16(T2, T3); mVal[5] = _mm256_add_epi16(T0, T1); - src += i_src2; - T0 = _mm256_permute2x128_si256(mVal[0], mVal[1], 0x21); T1 = mVal[1]; T2 = _mm256_permute2x128_si256(mVal[1], mVal[2], 0x21); @@ -2352,10 +2354,9 @@ void 
uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d T0 = _mm256_packs_epi32(T0, T4); s0 = _mm_packus_epi16(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1)); - s1 = _mm_srli_si128(s0, 8); _mm_storel_epi64((__m128i*)(dst), s0); - _mm_storel_epi64((__m128i*)(dst + i_dst), s1); + _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(s0)); r4 = _mm256_unpacklo_epi16(T8, T9); r9 = _mm256_unpackhi_epi16(T8, T9); @@ -2382,13 +2383,12 @@ void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_d T0 = _mm256_packs_epi32(T0, T4); s0 = _mm_packus_epi16(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1)); - s1 = _mm_srli_si128(s0, 8); + height -= 4; _mm_storel_epi64((__m128i*)(dst + i_dst * 2), s0); - _mm_storel_epi64((__m128i*)(dst + i_dst * 3), s1); + _mm_storeh_pi((__m64*)(dst + i_dst * 3), _mm_castsi128_ps(s0)); dst += i_dst << 2; - height -= 4; } } } @@ -2397,7 +2397,6 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ { ALIGNED_32(s16 tmp_res[(128 + 7) * 16]); s16 *tmp = tmp_res; - int row; __m256i mVal1, mVal2, mVal; __m256i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9; __m256i S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10; @@ -2406,6 +2405,7 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ //HOR { + int row; __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); @@ -2421,8 +2421,9 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ row = height + 7; while (row--) { S = _mm256_loadu_si256((__m256i*)(src)); - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); + src += 
i_src; S0 = _mm256_permute4x64_epi64(S, 0x94); + uavs3d_prefetch(src, _MM_HINT_NTA); r0 = _mm256_shuffle_epi8(S0, mSwitch1); r1 = _mm256_shuffle_epi8(S0, mSwitch2); @@ -2438,9 +2439,8 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ T1 = _mm256_add_epi16(T2, T3); sum = _mm256_add_epi16(T0, T1); - _mm256_storeu_si256((__m256i*)(tmp), sum); + _mm256_store_si256((__m256i*)(tmp), sum); - src += i_src; tmp += 16; } } @@ -2603,9 +2603,9 @@ void uavs3d_if_hor_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ mVal = _mm256_packs_epi32(mVal1, mVal2); _mm_storeu_si128((__m128i*)(dst + 3 * i_dst), _mm_packus_epi16(_mm256_castsi256_si128(mVal), _mm256_extracti128_si256(mVal, 1))); + height -= 4; tmp += 4 * i_tmp; dst += 4 * i_dst; - height -= 4; } } } @@ -2614,10 +2614,10 @@ void uavs3d_if_hor_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ { ALIGNED_32(s16 tmp_res[(128 + 7) * 32]); s16 *tmp = tmp_res; - int row, col; const int i_tmp = 32; //HOR { + int row; __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); @@ -2629,26 +2629,25 @@ void uavs3d_if_hor_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ __m256i mCoefy4_hor = _mm256_set1_epi16(*(s16*)(coef_x + 6)); __m256i T0, T1, T2, T3, T4, T5, T6, T7; - __m256i S0, S1, S2, S3; + __m256i S0, S1; src = src - 3 * i_src - 3; row = height + 7; while (row--) { - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + 8)); - S2 = _mm256_insertf128_si256(S0, _mm256_castsi256_si128(S1), 0x1); - S3 = _mm256_insertf128_si256(S1, 
_mm256_extracti128_si256(S0, 1), 0x0); - - T0 = _mm256_shuffle_epi8(S2, mSwitch1); - T1 = _mm256_shuffle_epi8(S2, mSwitch2); - T2 = _mm256_shuffle_epi8(S2, mSwitch3); - T3 = _mm256_shuffle_epi8(S2, mSwitch4); - T4 = _mm256_shuffle_epi8(S3, mSwitch1); - T5 = _mm256_shuffle_epi8(S3, mSwitch2); - T6 = _mm256_shuffle_epi8(S3, mSwitch3); - T7 = _mm256_shuffle_epi8(S3, mSwitch4); + src += i_src; + uavs3d_prefetch(src, _MM_HINT_NTA); + + T0 = _mm256_shuffle_epi8(S0, mSwitch1); + T1 = _mm256_shuffle_epi8(S0, mSwitch2); + T2 = _mm256_shuffle_epi8(S0, mSwitch3); + T3 = _mm256_shuffle_epi8(S0, mSwitch4); + T4 = _mm256_shuffle_epi8(S1, mSwitch1); + T5 = _mm256_shuffle_epi8(S1, mSwitch2); + T6 = _mm256_shuffle_epi8(S1, mSwitch3); + T7 = _mm256_shuffle_epi8(S1, mSwitch4); T0 = _mm256_maddubs_epi16(T0, mCoefy1_hor); T1 = _mm256_maddubs_epi16(T1, mCoefy2_hor); @@ -2666,10 +2665,11 @@ void uavs3d_if_hor_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ T0 = _mm256_add_epi16(T0, T2); T4 = _mm256_add_epi16(T4, T6); - _mm256_storeu_si256((__m256i*)(tmp), T0); - _mm256_storeu_si256((__m256i*)(tmp + 16), T4); + T1 = _mm256_permute2x128_si256(T0, T4, 0x20); + T3 = _mm256_permute2x128_si256(T0, T4, 0x31); + _mm256_store_si256((__m256i*)(tmp), T1); + _mm256_store_si256((__m256i*)(tmp + 16), T3); - src += i_src; tmp += i_tmp; } } @@ -2685,8 +2685,8 @@ void uavs3d_if_hor_ver_luma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ const int i_tmp8 = 32 * 8; const int i_tmp9 = 32 * 9; const int i_tmp10 = 32 * 10; - - int shift = 12; + int col; + const int shift = 12; __m256i mAddOffset = _mm256_set1_epi32(1 << 11); __m128i mCoefy11 = _mm_set1_epi16(*(s16*)coef_y); __m128i mCoefy22 = _mm_set1_epi16(*(s16*)(coef_y + 2)); @@ -2850,7 +2850,6 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i { ALIGNED_32(s16 tmp_res[(128 + 7) * 128]); s16 *tmp = tmp_res; - int row, col; const int i_tmp = width; __m256i mVal1, mVal2, mVal; __m256i T0, T1, T2, T3, T4, 
T5, T6, T7, T8, T9; @@ -2859,6 +2858,7 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i //HOR { + int row, col; __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); @@ -2878,17 +2878,15 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i { S0 = _mm256_loadu_si256((__m256i*)(src + col)); S1 = _mm256_loadu_si256((__m256i*)(src + col + 8)); - S2 = _mm256_insertf128_si256(S0, _mm256_castsi256_si128(S1), 0x1); - S3 = _mm256_insertf128_si256(S1, _mm256_extracti128_si256(S0, 1), 0x0); - - T0 = _mm256_shuffle_epi8(S2, mSwitch1); - T1 = _mm256_shuffle_epi8(S2, mSwitch2); - T2 = _mm256_shuffle_epi8(S2, mSwitch3); - T3 = _mm256_shuffle_epi8(S2, mSwitch4); - T4 = _mm256_shuffle_epi8(S3, mSwitch1); - T5 = _mm256_shuffle_epi8(S3, mSwitch2); - T6 = _mm256_shuffle_epi8(S3, mSwitch3); - T7 = _mm256_shuffle_epi8(S3, mSwitch4); + + T0 = _mm256_shuffle_epi8(S0, mSwitch1); + T1 = _mm256_shuffle_epi8(S0, mSwitch2); + T2 = _mm256_shuffle_epi8(S0, mSwitch3); + T3 = _mm256_shuffle_epi8(S0, mSwitch4); + T4 = _mm256_shuffle_epi8(S1, mSwitch1); + T5 = _mm256_shuffle_epi8(S1, mSwitch2); + T6 = _mm256_shuffle_epi8(S1, mSwitch3); + T7 = _mm256_shuffle_epi8(S1, mSwitch4); T0 = _mm256_maddubs_epi16(T0, mCoefy1_hor); T1 = _mm256_maddubs_epi16(T1, mCoefy2_hor); @@ -2906,8 +2904,10 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i T0 = _mm256_add_epi16(T0, T2); T4 = _mm256_add_epi16(T4, T6); - _mm256_storeu_si256((__m256i*)(tmp + col), T0); - _mm256_storeu_si256((__m256i*)(tmp + col + 16), T4); + T1 = _mm256_permute2x128_si256(T0, T4, 0x20); + T3 = 
_mm256_permute2x128_si256(T0, T4, 0x31); + _mm256_store_si256((__m256i*)(tmp + col), T1); + _mm256_store_si256((__m256i*)(tmp + col + 16), T3); } src += i_src; tmp += i_tmp; @@ -2927,6 +2927,7 @@ void uavs3d_if_hor_ver_luma_w32x_avx2(const pel *src, int i_src, pel *dst, int i const int i_tmp10 = i_tmp5 << 1; int shift = 12; + int col; __m256i mAddOffset = _mm256_set1_epi32(1 << 11); __m128i mCoefy11 = _mm_set1_epi16(*(s16*)coef_y); __m128i mCoefy22 = _mm_set1_epi16(*(s16*)(coef_y + 2)); @@ -3186,45 +3187,121 @@ void uavs3d_if_cpy_w128_avx2(const pel *src, int i_src, pel *dst, int i_dst, int } } +void uavs3d_if_hor_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val) +{ + __m256i max_pel = _mm256_set1_epi16((pel)max_val); + __m256i T0, T1, T2, T3, T4, T5; + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i S0, S1, S2; + __m256i offset = _mm256_set1_epi32(32); + __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + __m256i mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[0])); + __m256i mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[1])); + __m256i mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[2])); + __m256i mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[3])); + __m128i s0, s1; + + src -= 3; + + while (height) { + T0 = _mm256_loadu_si256((__m256i*)(src)); + s0 = _mm_loadu_si128((__m128i*)(src + 4)); + T1 = _mm256_loadu_si256((__m256i*)(src + i_src)); + s1 = _mm_loadu_si128((__m128i*)(src + i_src + 4)); + height -= 2; + src += i_src << 1; + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); + + S0 = _mm256_permute2x128_si256(T0, T1, 0x20); + S2 = _mm256_permute2x128_si256(T0, T1, 0x31); + S1 = 
_mm256_set_m128i(s1, s0); + + T0 = _mm256_shuffle_epi8(S0, mShuffle0); + T1 = _mm256_shuffle_epi8(S0, mShuffle1); + T2 = _mm256_shuffle_epi8(S1, mShuffle0); + T3 = _mm256_shuffle_epi8(S1, mShuffle1); + T4 = _mm256_shuffle_epi8(S2, mShuffle0); + T5 = _mm256_shuffle_epi8(S2, mShuffle1); + + M0 = _mm256_madd_epi16(T0, mCoef0); + M1 = _mm256_madd_epi16(T1, mCoef1); + M2 = _mm256_madd_epi16(T2, mCoef2); + M3 = _mm256_madd_epi16(T3, mCoef3); + M4 = _mm256_madd_epi16(T2, mCoef0); + M5 = _mm256_madd_epi16(T3, mCoef1); + M6 = _mm256_madd_epi16(T4, mCoef2); + M7 = _mm256_madd_epi16(T5, mCoef3); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + M2 = _mm256_add_epi32(M4, M5); + M3 = _mm256_add_epi32(M6, M7); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + + M2 = _mm256_add_epi32(M0, offset); + M3 = _mm256_add_epi32(M1, offset); + M2 = _mm256_srai_epi32(M2, 6); + M3 = _mm256_srai_epi32(M3, 6); + M2 = _mm256_packus_epi32(M2, M3); + M2 = _mm256_min_epu16(M2, max_pel); + + _mm_storeu_si128((__m128i*)(dst), _mm256_castsi256_si128(M2)); + _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(M2, 1)); + + dst += i_dst << 1; + } +} + void uavs3d_if_hor_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val) { __m256i max_pel = _mm256_set1_epi16((pel)max_val); - __m256i T0, T1, T2, T3, T4, T5, T6, T7; + __m256i T0, T1, T2, T3, T4, T5; __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i S0, S1, S2; __m256i offset = _mm256_set1_epi32(32); - s32* coef = (s32*)coeff; - __m128i mCoef0 = _mm_setr_epi32(coef[0], coef[1], coef[0], coef[1]); - __m256i mCoef = _mm256_cvtepi8_epi16(mCoef0); + __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + __m256i mCoef0 
= _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[0])); + __m256i mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[1])); + __m256i mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[2])); + __m256i mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[3])); src -= 3; while (height--) { - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); - T0 = _mm256_loadu_si256((__m256i*)(src + 0)); - T1 = _mm256_loadu_si256((__m256i*)(src + 1)); - T2 = _mm256_loadu_si256((__m256i*)(src + 2)); - T3 = _mm256_loadu_si256((__m256i*)(src + 3)); - T4 = _mm256_loadu_si256((__m256i*)(src + 4)); - T5 = _mm256_loadu_si256((__m256i*)(src + 5)); - T6 = _mm256_loadu_si256((__m256i*)(src + 6)); - T7 = _mm256_loadu_si256((__m256i*)(src + 7)); - - M0 = _mm256_madd_epi16(T0, mCoef); - M1 = _mm256_madd_epi16(T1, mCoef); - M2 = _mm256_madd_epi16(T2, mCoef); - M3 = _mm256_madd_epi16(T3, mCoef); - M4 = _mm256_madd_epi16(T4, mCoef); - M5 = _mm256_madd_epi16(T5, mCoef); - M6 = _mm256_madd_epi16(T6, mCoef); - M7 = _mm256_madd_epi16(T7, mCoef); - - M0 = _mm256_hadd_epi32(M0, M1); - M1 = _mm256_hadd_epi32(M2, M3); - M2 = _mm256_hadd_epi32(M4, M5); - M3 = _mm256_hadd_epi32(M6, M7); - - M0 = _mm256_hadd_epi32(M0, M1); - M1 = _mm256_hadd_epi32(M2, M3); + S0 = _mm256_lddqu_si256((__m256i*)(src)); + S1 = _mm256_loadu_si256((__m256i*)(src + 4)); + S2 = _mm256_loadu_si256((__m256i*)(src + 8)); + + src += i_src; + T0 = _mm256_shuffle_epi8(S0, mShuffle0); + T1 = _mm256_shuffle_epi8(S0, mShuffle1); + T2 = _mm256_shuffle_epi8(S1, mShuffle0); + T3 = _mm256_shuffle_epi8(S1, mShuffle1); + T4 = _mm256_shuffle_epi8(S2, mShuffle0); + T5 = _mm256_shuffle_epi8(S2, mShuffle1); + uavs3d_prefetch(src, _MM_HINT_NTA); + + M0 = _mm256_madd_epi16(T0, mCoef0); + M1 = _mm256_madd_epi16(T1, mCoef1); + M2 = _mm256_madd_epi16(T2, mCoef2); + M3 = _mm256_madd_epi16(T3, mCoef3); + M4 = _mm256_madd_epi16(T2, mCoef0); + M5 = _mm256_madd_epi16(T3, mCoef1); + M6 = _mm256_madd_epi16(T4, mCoef2); + M7 = 
_mm256_madd_epi16(T5, mCoef3); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + M2 = _mm256_add_epi32(M4, M5); + M3 = _mm256_add_epi32(M6, M7); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); M2 = _mm256_add_epi32(M0, offset); M3 = _mm256_add_epi32(M1, offset); @@ -3235,7 +3312,6 @@ void uavs3d_if_hor_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, _mm256_storeu_si256((__m256i*)(dst), M2); - src += i_src; dst += i_dst; } } @@ -3244,12 +3320,16 @@ void uavs3d_if_hor_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst { int col; __m256i max_pel = _mm256_set1_epi16((pel)max_val); - __m256i T0, T1, T2, T3, T4, T5, T6, T7; + __m256i T0, T1, T2, T3, T4, T5; __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i S0, S1, S2; __m256i offset = _mm256_set1_epi32(32); - s32 *coef = (s32*)coeff; - __m128i mCoef0 = _mm_setr_epi32(coef[0], coef[1], coef[0], coef[1]); - __m256i mCoef = _mm256_cvtepi8_epi16(mCoef0); + __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + __m256i mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[0])); + __m256i mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[1])); + __m256i mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[2])); + __m256i mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coeff)[3])); src -= 3; @@ -3258,31 +3338,33 @@ void uavs3d_if_hor_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst uavs3d_prefetch(src + i_src, _MM_HINT_NTA); for (col = 0; col < width; col += 16) { - T0 = _mm256_loadu_si256((__m256i*)(p_src + 0)); - T1 = _mm256_loadu_si256((__m256i*)(p_src + 1)); - T2 = _mm256_loadu_si256((__m256i*)(p_src + 2)); - T3 = _mm256_loadu_si256((__m256i*)(p_src + 3)); - T4 = 
_mm256_loadu_si256((__m256i*)(p_src + 4)); - T5 = _mm256_loadu_si256((__m256i*)(p_src + 5)); - T6 = _mm256_loadu_si256((__m256i*)(p_src + 6)); - T7 = _mm256_loadu_si256((__m256i*)(p_src + 7)); - - M0 = _mm256_madd_epi16(T0, mCoef); - M1 = _mm256_madd_epi16(T1, mCoef); - M2 = _mm256_madd_epi16(T2, mCoef); - M3 = _mm256_madd_epi16(T3, mCoef); - M4 = _mm256_madd_epi16(T4, mCoef); - M5 = _mm256_madd_epi16(T5, mCoef); - M6 = _mm256_madd_epi16(T6, mCoef); - M7 = _mm256_madd_epi16(T7, mCoef); - - M0 = _mm256_hadd_epi32(M0, M1); - M1 = _mm256_hadd_epi32(M2, M3); - M2 = _mm256_hadd_epi32(M4, M5); - M3 = _mm256_hadd_epi32(M6, M7); - - M0 = _mm256_hadd_epi32(M0, M1); - M1 = _mm256_hadd_epi32(M2, M3); + S0 = _mm256_loadu_si256((__m256i*)(p_src)); + S1 = _mm256_loadu_si256((__m256i*)(p_src + 4)); + S2 = _mm256_loadu_si256((__m256i*)(p_src + 8)); + + T0 = _mm256_shuffle_epi8(S0, mShuffle0); + T1 = _mm256_shuffle_epi8(S0, mShuffle1); + T2 = _mm256_shuffle_epi8(S1, mShuffle0); + T3 = _mm256_shuffle_epi8(S1, mShuffle1); + T4 = _mm256_shuffle_epi8(S2, mShuffle0); + T5 = _mm256_shuffle_epi8(S2, mShuffle1); + + M0 = _mm256_madd_epi16(T0, mCoef0); + M1 = _mm256_madd_epi16(T1, mCoef1); + M2 = _mm256_madd_epi16(T2, mCoef2); + M3 = _mm256_madd_epi16(T3, mCoef3); + M4 = _mm256_madd_epi16(T2, mCoef0); + M5 = _mm256_madd_epi16(T3, mCoef1); + M6 = _mm256_madd_epi16(T4, mCoef2); + M7 = _mm256_madd_epi16(T5, mCoef3); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + M2 = _mm256_add_epi32(M4, M5); + M3 = _mm256_add_epi32(M6, M7); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); M2 = _mm256_add_epi32(M0, offset); M3 = _mm256_add_epi32(M1, offset); @@ -3299,6 +3381,47 @@ void uavs3d_if_hor_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst } } +void uavs3d_if_hor_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val) +{ + const int offset = 32; + const int shift = 6; + + __m128i coef0 = 
_mm_cvtepi8_epi16(_mm_set1_epi16(((s16*)coeff)[0])); + __m128i coef1 = _mm_cvtepi8_epi16(_mm_set1_epi16(((s16*)coeff)[1])); + __m256i mCoef0 = _mm256_set_m128i(coef1, coef0); + __m256i mCoef1 = _mm256_set_m128i(coef0, coef1); + __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11); + __m256i mAddOffset = _mm256_set1_epi32((s16)offset); + __m256i T0, T1, S0, S1; + __m256i max_pel = _mm256_set1_epi16((pel)max_val); + __m128i s0; + + src -= 2; + + while (height--) { + uavs3d_prefetch(src + i_src * 2, _MM_HINT_NTA); + S0 = _mm256_loadu_si256((__m256i*)(src)); + s0 = _mm_loadu_si128((__m128i*)(src + 4)); + src += i_src; + S1 = _mm256_set_m128i(s0, s0); + uavs3d_prefetch(src, _MM_HINT_NTA); + T0 = _mm256_shuffle_epi8(S0, mSwitch); + T1 = _mm256_shuffle_epi8(S1, mSwitch); + T0 = _mm256_madd_epi16(T0, mCoef0); + T1 = _mm256_madd_epi16(T1, mCoef1); + T0 = _mm256_add_epi32(T0, T1); + + T0 = _mm256_add_epi32(T0, mAddOffset); + T0 = _mm256_srai_epi32(T0, shift); + T0 = _mm256_min_epu16(T0, max_pel); + s0 = _mm_packus_epi32(_mm256_castsi256_si128(T0), _mm256_extracti128_si256(T0, 1)); + + _mm_storeu_si128((__m128i*)(dst), s0); + + dst += i_dst; + } +} + void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val) { const int offset = 32; @@ -3310,7 +3433,7 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds __m256i mCoef1 = _mm256_cvtepi8_epi16(coef1); __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11); __m256i mAddOffset = _mm256_set1_epi32((s16)offset); - __m256i T0, T1, T2, T3, S0, S1, S2, S3; + __m256i T0, T1, T2, T3, S0, S1, S2; __m256i max_pel = _mm256_set1_epi16((pel)max_val); src -= 2; @@ -3320,15 +3443,14 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds S0 = 
_mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + 4)); S2 = _mm256_loadu_si256((__m256i*)(src + 8)); - S3 = _mm256_loadu_si256((__m256i*)(src + 12)); - S0 = _mm256_permute4x64_epi64(S0, 0x94); - S1 = _mm256_permute4x64_epi64(S1, 0x94); - S2 = _mm256_permute4x64_epi64(S2, 0x94); - S3 = _mm256_permute4x64_epi64(S3, 0x94); - T0 = _mm256_madd_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef0); - T1 = _mm256_madd_epi16(_mm256_shuffle_epi8(S1, mSwitch), mCoef1); - T2 = _mm256_madd_epi16(_mm256_shuffle_epi8(S2, mSwitch), mCoef0); - T3 = _mm256_madd_epi16(_mm256_shuffle_epi8(S3, mSwitch), mCoef1); + T0 = _mm256_shuffle_epi8(S0, mSwitch); + T1 = _mm256_shuffle_epi8(S1, mSwitch); + T2 = _mm256_shuffle_epi8(S1, mSwitch); + T3 = _mm256_shuffle_epi8(S2, mSwitch); + T0 = _mm256_madd_epi16(T0, mCoef0); + T1 = _mm256_madd_epi16(T1, mCoef1); + T2 = _mm256_madd_epi16(T2, mCoef0); + T3 = _mm256_madd_epi16(T3, mCoef1); T0 = _mm256_add_epi32(T0, T1); T2 = _mm256_add_epi32(T2, T3); @@ -3337,7 +3459,6 @@ void uavs3d_if_hor_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds T0 = _mm256_srai_epi32(T0, shift); T2 = _mm256_srai_epi32(T2, shift); T0 = _mm256_packus_epi32(T0, T2); - T0 = _mm256_permute4x64_epi64(T0, 0xd8); T0 = _mm256_min_epu16(T0, max_pel); _mm256_storeu_si256((__m256i*)(dst), T0); @@ -3359,7 +3480,7 @@ void uavs3d_if_hor_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_d __m256i mCoef1 = _mm256_cvtepi8_epi16(coef1); __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11); __m256i mAddOffset = _mm256_set1_epi32((s16)offset); - __m256i T0, T1, T2, T3, S0, S1, S2, S3; + __m256i T0, T1, T2, T3, S0, S1, S2; __m256i max_pel = _mm256_set1_epi16((pel)max_val); src -= 2; @@ -3370,15 +3491,14 @@ void uavs3d_if_hor_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_d S0 = _mm256_loadu_si256((__m256i*)(src + col)); S1 = _mm256_loadu_si256((__m256i*)(src 
+ col + 4)); S2 = _mm256_loadu_si256((__m256i*)(src + col + 8)); - S3 = _mm256_loadu_si256((__m256i*)(src + col + 12)); - S0 = _mm256_permute4x64_epi64(S0, 0x94); - S1 = _mm256_permute4x64_epi64(S1, 0x94); - S2 = _mm256_permute4x64_epi64(S2, 0x94); - S3 = _mm256_permute4x64_epi64(S3, 0x94); - T0 = _mm256_madd_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef0); - T1 = _mm256_madd_epi16(_mm256_shuffle_epi8(S1, mSwitch), mCoef1); - T2 = _mm256_madd_epi16(_mm256_shuffle_epi8(S2, mSwitch), mCoef0); - T3 = _mm256_madd_epi16(_mm256_shuffle_epi8(S3, mSwitch), mCoef1); + T0 = _mm256_shuffle_epi8(S0, mSwitch); + T1 = _mm256_shuffle_epi8(S1, mSwitch); + T2 = _mm256_shuffle_epi8(S1, mSwitch); + T3 = _mm256_shuffle_epi8(S2, mSwitch); + T0 = _mm256_madd_epi16(T0, mCoef0); + T1 = _mm256_madd_epi16(T1, mCoef1); + T2 = _mm256_madd_epi16(T2, mCoef0); + T3 = _mm256_madd_epi16(T3, mCoef1); T0 = _mm256_add_epi32(T0, T1); T2 = _mm256_add_epi32(T2, T3); @@ -3387,9 +3507,8 @@ void uavs3d_if_hor_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_d T0 = _mm256_srai_epi32(T0, shift); T2 = _mm256_srai_epi32(T2, shift); T0 = _mm256_packus_epi32(T0, T2); - T0 = _mm256_permute4x64_epi64(T0, 0xd8); - T0 = _mm256_min_epu16(T0, max_pel); + _mm256_storeu_si256((__m256i*)(dst + col), T0); } src += i_src; @@ -3397,6 +3516,93 @@ void uavs3d_if_hor_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_d } } +void uavs3d_if_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val) +{ + const int i_src2 = i_src * 2; + const int i_src3 = i_src * 3; + const int i_src4 = i_src * 4; + const int i_src5 = i_src * 5; + const int i_src6 = i_src * 6; + const int i_src7 = i_src * 7; + __m128i coeff0 = _mm_set1_epi16(*(s16*)coeff); + __m128i coeff1 = _mm_set1_epi16(*(s16*)(coeff + 2)); + __m128i coeff2 = _mm_set1_epi16(*(s16*)(coeff + 4)); + __m128i coeff3 = _mm_set1_epi16(*(s16*)(coeff + 6)); + __m256i max_pel = _mm256_set1_epi16((pel)max_val); 
+ __m256i mAddOffset = _mm256_set1_epi32(32); + __m128i s0, s1, s2, s3, s4, s5, s6, s7, s8; + __m256i T0, T1, T2, T3, T4, T5, T6, T7; + __m256i N0, N1, N2, N3, N4, N5, N6, N7; + __m256i coeff00 = _mm256_cvtepi8_epi16(coeff0); + __m256i coeff01 = _mm256_cvtepi8_epi16(coeff1); + __m256i coeff02 = _mm256_cvtepi8_epi16(coeff2); + __m256i coeff03 = _mm256_cvtepi8_epi16(coeff3); + + src -= i_src3; + + while (height > 0) { + s0 = _mm_loadu_si128((__m128i*)(src)); + s1 = _mm_loadu_si128((__m128i*)(src + i_src)); + s2 = _mm_loadu_si128((__m128i*)(src + i_src2)); + s3 = _mm_loadu_si128((__m128i*)(src + i_src3)); + s4 = _mm_loadu_si128((__m128i*)(src + i_src4)); + s5 = _mm_loadu_si128((__m128i*)(src + i_src5)); + s6 = _mm_loadu_si128((__m128i*)(src + i_src6)); + s7 = _mm_loadu_si128((__m128i*)(src + i_src7)); + s8 = _mm_loadu_si128((__m128i*)(src + (i_src << 3))); + + height -= 2; + src += i_src2; + uavs3d_prefetch(src + i_src7, _MM_HINT_NTA); + + T0 = _mm256_set_m128i(s1, s0); + T1 = _mm256_set_m128i(s2, s1); + T2 = _mm256_set_m128i(s3, s2); + T3 = _mm256_set_m128i(s4, s3); + T4 = _mm256_set_m128i(s5, s4); + T5 = _mm256_set_m128i(s6, s5); + T6 = _mm256_set_m128i(s7, s6); + T7 = _mm256_set_m128i(s8, s7); + + N0 = _mm256_unpacklo_epi16(T0, T1); + N1 = _mm256_unpacklo_epi16(T2, T3); + N2 = _mm256_unpacklo_epi16(T4, T5); + N3 = _mm256_unpacklo_epi16(T6, T7); + N4 = _mm256_unpackhi_epi16(T0, T1); + N5 = _mm256_unpackhi_epi16(T2, T3); + N6 = _mm256_unpackhi_epi16(T4, T5); + N7 = _mm256_unpackhi_epi16(T6, T7); + + N0 = _mm256_madd_epi16(N0, coeff00); + N1 = _mm256_madd_epi16(N1, coeff01); + N2 = _mm256_madd_epi16(N2, coeff02); + N3 = _mm256_madd_epi16(N3, coeff03); + N4 = _mm256_madd_epi16(N4, coeff00); + N5 = _mm256_madd_epi16(N5, coeff01); + N6 = _mm256_madd_epi16(N6, coeff02); + N7 = _mm256_madd_epi16(N7, coeff03); + + N0 = _mm256_add_epi32(N0, N1); + N1 = _mm256_add_epi32(N2, N3); + N2 = _mm256_add_epi32(N4, N5); + N3 = _mm256_add_epi32(N6, N7); + + N0 = _mm256_add_epi32(N0, 
N1); + N1 = _mm256_add_epi32(N2, N3); + + N0 = _mm256_add_epi32(N0, mAddOffset); + N1 = _mm256_add_epi32(N1, mAddOffset); + N0 = _mm256_srai_epi32(N0, 6); + N1 = _mm256_srai_epi32(N1, 6); + N0 = _mm256_packus_epi32(N0, N1); + N0 = _mm256_min_epu16(N0, max_pel); + _mm_storeu_si128((__m128i*)(dst), _mm256_castsi256_si128(N0)); + _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(N0, 1)); + + dst += i_dst << 1; + } +} + void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coeff, int max_val) { const int i_src2 = i_src * 2; @@ -3412,7 +3618,6 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, __m256i max_pel = _mm256_set1_epi16((pel)max_val); __m256i mAddOffset = _mm256_set1_epi32(32); __m256i T0, T1, T2, T3, T4, T5, T6, T7; - __m256i M0, M1, M2, M3, M4, M5, M6, M7; __m256i N0, N1, N2, N3, N4, N5, N6, N7; __m256i coeff00 = _mm256_cvtepi8_epi16(coeff0); __m256i coeff01 = _mm256_cvtepi8_epi16(coeff1); @@ -3422,7 +3627,6 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, src -= 3 * i_src; while (height--) { - uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA); T0 = _mm256_loadu_si256((__m256i*)(src)); T1 = _mm256_loadu_si256((__m256i*)(src + i_src)); T2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); @@ -3431,24 +3635,25 @@ void uavs3d_if_ver_luma_w16_avx2(const pel *src, int i_src, pel *dst, int i_dst, T5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); T6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); T7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); + uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA); - M0 = _mm256_unpacklo_epi16(T0, T1); - M1 = _mm256_unpacklo_epi16(T2, T3); - M2 = _mm256_unpacklo_epi16(T4, T5); - M3 = _mm256_unpacklo_epi16(T6, T7); - M4 = _mm256_unpackhi_epi16(T0, T1); - M5 = _mm256_unpackhi_epi16(T2, T3); - M6 = _mm256_unpackhi_epi16(T4, T5); - M7 = _mm256_unpackhi_epi16(T6, T7); - - N0 = _mm256_madd_epi16(M0, 
coeff00); - N1 = _mm256_madd_epi16(M1, coeff01); - N2 = _mm256_madd_epi16(M2, coeff02); - N3 = _mm256_madd_epi16(M3, coeff03); - N4 = _mm256_madd_epi16(M4, coeff00); - N5 = _mm256_madd_epi16(M5, coeff01); - N6 = _mm256_madd_epi16(M6, coeff02); - N7 = _mm256_madd_epi16(M7, coeff03); + N0 = _mm256_unpacklo_epi16(T0, T1); + N1 = _mm256_unpacklo_epi16(T2, T3); + N2 = _mm256_unpacklo_epi16(T4, T5); + N3 = _mm256_unpacklo_epi16(T6, T7); + N4 = _mm256_unpackhi_epi16(T0, T1); + N5 = _mm256_unpackhi_epi16(T2, T3); + N6 = _mm256_unpackhi_epi16(T4, T5); + N7 = _mm256_unpackhi_epi16(T6, T7); + + N0 = _mm256_madd_epi16(N0, coeff00); + N1 = _mm256_madd_epi16(N1, coeff01); + N2 = _mm256_madd_epi16(N2, coeff02); + N3 = _mm256_madd_epi16(N3, coeff03); + N4 = _mm256_madd_epi16(N4, coeff00); + N5 = _mm256_madd_epi16(N5, coeff01); + N6 = _mm256_madd_epi16(N6, coeff02); + N7 = _mm256_madd_epi16(N7, coeff03); N0 = _mm256_add_epi32(N0, N1); N1 = _mm256_add_epi32(N2, N3); @@ -3568,20 +3773,23 @@ void uavs3d_if_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds while (height) { __m256i S0, S1, S2, S3, S4; - uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA); - uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA); - height -= 2; S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); + height -= 2; + src += i_src2; + T0 = _mm256_unpacklo_epi16(S0, S1); T1 = _mm256_unpackhi_epi16(S0, S1); T2 = _mm256_unpacklo_epi16(S2, S3); T3 = _mm256_unpackhi_epi16(S2, S3); + uavs3d_prefetch(src + i_src3, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src4, _MM_HINT_NTA); + T0 = _mm256_madd_epi16(T0, coeff0); T1 = _mm256_madd_epi16(T1, coeff0); T2 = _mm256_madd_epi16(T2, coeff1); @@ -3621,7 +3829,6 @@ void uavs3d_if_ver_chroma_w16_avx2(const pel *src, int i_src, pel *dst, int i_ds mVal1 = _mm256_min_epu16(mVal1, max_pel); 
_mm256_storeu_si256((__m256i*)(dst + i_dst), mVal1); - src += 2 * i_src; dst += 2 * i_dst; } } @@ -3645,9 +3852,6 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds src -= i_src; while (height) { - uavs3d_prefetch(src + 5 * i_src, _MM_HINT_NTA); - uavs3d_prefetch(src + 6 * i_src, _MM_HINT_NTA); - height -= 2; S0 = _mm256_loadu_si256((__m256i*)(src)); S5 = _mm256_loadu_si256((__m256i*)(src + 16)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); @@ -3659,6 +3863,9 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); S9 = _mm256_loadu_si256((__m256i*)(src + i_src4 + 16)); + height -= 2; + src += i_src2; + T0 = _mm256_unpacklo_epi16(S0, S1); T1 = _mm256_unpackhi_epi16(S0, S1); T2 = _mm256_unpacklo_epi16(S2, S3); @@ -3668,6 +3875,9 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds T6 = _mm256_unpacklo_epi16(S7, S8); T7 = _mm256_unpackhi_epi16(S7, S8); + uavs3d_prefetch(src + i_src3, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src4, _MM_HINT_NTA); + T0 = _mm256_madd_epi16(T0, coeff0); T1 = _mm256_madd_epi16(T1, coeff0); T2 = _mm256_madd_epi16(T2, coeff1); @@ -3738,7 +3948,6 @@ void uavs3d_if_ver_chroma_w32_avx2(const pel *src, int i_src, pel *dst, int i_ds _mm256_storeu_si256((__m256i*)(dst + i_dst), T0); _mm256_storeu_si256((__m256i*)(dst + i_dst + 16), T2); - src += 2 * i_src; dst += 2 * i_dst; } } @@ -3820,22 +4029,18 @@ void uavs3d_if_ver_chroma_w32x_avx2(const pel *src, int i_src, pel *dst, int i_d } } -void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val) +void uavs3d_if_hor_ver_luma_w4_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val) { - ALIGNED_32(s16 tmp_res[(128 + 7) * 128]); + ALIGNED_32(s16 tmp_res[(32 + 7) * 4]); s16 *tmp = tmp_res; - 
int row, i;; + int row; int add1, shift1; int add2, shift2; - __m128i mCoef0; - __m256i mCoef, offset; - __m256i T0, T1, T2, T3, T4, T5, T6, T7; - __m256i M0, M1, M2, M3, M4, M5, M6, M7; - __m256i N0, N1, N2, N3, N4, N5, N6, N7; - int i_tmp = width; - s32 * coef; - __m128i coeff0, coeff1, coeff2, coeff3; - __m256i coeff00, coeff01, coeff02, coeff03; + __m256i offset; + __m256i T0, T1, T2, T3; + __m256i M0, M1, M2, M3; + const int i_tmp = 4; + __m256i mCoef0, mCoef1, mCoef2, mCoef3; __m256i max_pel = _mm256_set1_epi16((pel)max_val); if (max_val == 255) { // 8 bit_depth @@ -3851,80 +4056,311 @@ void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i add2 = 1 << (shift2 - 1); src += -3 * i_src - 3; - coef = (s32*)coef_x; - mCoef0 = _mm_setr_epi32(coef[0], coef[1], coef[0], coef[1]); - mCoef = _mm256_cvtepi8_epi16(mCoef0); - offset = _mm256_set1_epi32(add1); - row = height + 7; + { + __m128i s0, s1, s2, s3; + __m256i S0, S1; + __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[0])); + mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[1])); + mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[2])); + mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[3])); + offset = _mm256_set1_epi32(add1); + + row = height + 6; + + while (row > 0) { + s0 = _mm_loadu_si128((__m128i*)(src)); + s1 = _mm_loadu_si128((__m128i*)(src + 4)); + s2 = _mm_loadu_si128((__m128i*)(src + i_src)); + s3 = _mm_loadu_si128((__m128i*)(src + i_src + 4)); + row -= 2; + src += i_src << 1; + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); - while (row--) { - const pel *p = src; - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); - 
for (i = 0; i < width; i += 16) { + S0 = _mm256_set_m128i(s2, s0); + S1 = _mm256_set_m128i(s3, s1); + + T0 = _mm256_shuffle_epi8(S0, mShuffle0); + T1 = _mm256_shuffle_epi8(S0, mShuffle1); + T2 = _mm256_shuffle_epi8(S1, mShuffle0); + T3 = _mm256_shuffle_epi8(S1, mShuffle1); - T0 = _mm256_loadu_si256((__m256i*)p++); - T1 = _mm256_loadu_si256((__m256i*)p++); - T2 = _mm256_loadu_si256((__m256i*)p++); - T3 = _mm256_loadu_si256((__m256i*)p++); - T4 = _mm256_loadu_si256((__m256i*)p++); - T5 = _mm256_loadu_si256((__m256i*)p++); - T6 = _mm256_loadu_si256((__m256i*)p++); - T7 = _mm256_loadu_si256((__m256i*)p++); - - M0 = _mm256_madd_epi16(T0, mCoef); - M1 = _mm256_madd_epi16(T1, mCoef); - M2 = _mm256_madd_epi16(T2, mCoef); - M3 = _mm256_madd_epi16(T3, mCoef); - M4 = _mm256_madd_epi16(T4, mCoef); - M5 = _mm256_madd_epi16(T5, mCoef); - M6 = _mm256_madd_epi16(T6, mCoef); - M7 = _mm256_madd_epi16(T7, mCoef); - - M0 = _mm256_hadd_epi32(M0, M1); - M1 = _mm256_hadd_epi32(M2, M3); - M2 = _mm256_hadd_epi32(M4, M5); - M3 = _mm256_hadd_epi32(M6, M7); - - M0 = _mm256_hadd_epi32(M0, M1); - M1 = _mm256_hadd_epi32(M2, M3); + M0 = _mm256_madd_epi16(T0, mCoef0); + M1 = _mm256_madd_epi16(T1, mCoef1); + M2 = _mm256_madd_epi16(T2, mCoef2); + M3 = _mm256_madd_epi16(T3, mCoef3); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + + M0 = _mm256_add_epi32(M0, M1); + + M2 = _mm256_add_epi32(M0, offset); + M2 = _mm256_srai_epi32(M2, shift1); + + s0 = _mm_packs_epi32(_mm256_castsi256_si128(M2), _mm256_extracti128_si256(M2, 1)); + _mm_store_si128((__m128i*)(tmp), s0); + + tmp += i_tmp * 2; + } + { + // the last row + __m128i t0, t1, t2, t3; + __m128i m0, m1, m2, m3; + s0 = _mm_loadu_si128((__m128i*)(src)); + s1 = _mm_loadu_si128((__m128i*)(src + 4)); + src += i_src; + + t0 = _mm_shuffle_epi8(s0, _mm256_castsi256_si128(mShuffle0)); + t1 = _mm_shuffle_epi8(s0, _mm256_castsi256_si128(mShuffle1)); + t2 = _mm_shuffle_epi8(s1, _mm256_castsi256_si128(mShuffle0)); + t3 = _mm_shuffle_epi8(s1, 
_mm256_castsi256_si128(mShuffle1)); + + m0 = _mm_madd_epi16(t0, _mm256_castsi256_si128(mCoef0)); + m1 = _mm_madd_epi16(t1, _mm256_castsi256_si128(mCoef1)); + m2 = _mm_madd_epi16(t2, _mm256_castsi256_si128(mCoef2)); + m3 = _mm_madd_epi16(t3, _mm256_castsi256_si128(mCoef3)); + + m0 = _mm_add_epi32(m0, m1); + m1 = _mm_add_epi32(m2, m3); + + m0 = _mm_add_epi32(m0, m1); + + m0 = _mm_add_epi32(m0, _mm256_castsi256_si128(offset)); + m0 = _mm_srai_epi32(m0, shift1); + m0 = _mm_packs_epi32(m0, m0); + _mm_storel_epi64((__m128i*)tmp, m0); + } + } + + { + __m256i T4, T5, T6, T7, M4, M5, M6, M7; + __m128i d0, d1; + + offset = _mm256_set1_epi32(add2); + tmp = tmp_res; + + mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[0])); + mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[1])); + mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[2])); + mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[3])); + + while (height > 0) { + T0 = _mm256_load_si256((__m256i*)(tmp)); + T1 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp)); + T2 = _mm256_loadu_si256((__m256i*)(tmp + 2 * i_tmp)); + T3 = _mm256_loadu_si256((__m256i*)(tmp + 3 * i_tmp)); + T4 = _mm256_load_si256((__m256i*)(tmp + 4 * i_tmp)); + T5 = _mm256_loadu_si256((__m256i*)(tmp + 5 * i_tmp)); + T6 = _mm256_loadu_si256((__m256i*)(tmp + 6 * i_tmp)); + T7 = _mm256_loadu_si256((__m256i*)(tmp + 7 * i_tmp)); + height -= 4; + tmp += i_tmp * 4; + + M0 = _mm256_unpacklo_epi16(T0, T1); + M1 = _mm256_unpacklo_epi16(T2, T3); + M2 = _mm256_unpacklo_epi16(T4, T5); + M3 = _mm256_unpacklo_epi16(T6, T7); + M4 = _mm256_unpackhi_epi16(T0, T1); + M5 = _mm256_unpackhi_epi16(T2, T3); + M6 = _mm256_unpackhi_epi16(T4, T5); + M7 = _mm256_unpackhi_epi16(T6, T7); + + M0 = _mm256_madd_epi16(M0, mCoef0); + M1 = _mm256_madd_epi16(M1, mCoef1); + M2 = _mm256_madd_epi16(M2, mCoef2); + M3 = _mm256_madd_epi16(M3, mCoef3); + M4 = _mm256_madd_epi16(M4, mCoef0); + M5 = _mm256_madd_epi16(M5, mCoef1); + M6 = _mm256_madd_epi16(M6, 
mCoef2); + M7 = _mm256_madd_epi16(M7, mCoef3); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + M2 = _mm256_add_epi32(M4, M5); + M3 = _mm256_add_epi32(M6, M7); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + + M0 = _mm256_add_epi32(M0, offset); + M1 = _mm256_add_epi32(M1, offset); + M0 = _mm256_srai_epi32(M0, shift2); + M1 = _mm256_srai_epi32(M1, shift2); + M0 = _mm256_packus_epi32(M0, M1); + M0 = _mm256_min_epu16(M0, max_pel); + + d0 = _mm256_castsi256_si128(M0); + d1 = _mm256_extracti128_si256(M0, 1); + _mm_storel_epi64((__m128i*)(dst), d0); + _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(d0)); + _mm_storel_epi64((__m128i*)(dst + (i_dst << 1)), d1); + _mm_storeh_pi((__m64*)(dst + i_dst * 3), _mm_castsi128_ps(d1)); + + dst += i_dst << 2; + } + } +} + +void uavs3d_if_hor_ver_luma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val) +{ + ALIGNED_32(s16 tmp_res[(64 + 7) * 8]); + s16 *tmp = tmp_res; + int row; + int add1, shift1; + int add2, shift2; + __m256i offset; + __m256i T0, T1, T2, T3, T4, T5; + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + const int i_tmp = 8; + __m256i mCoef0, mCoef1, mCoef2, mCoef3; + __m256i max_pel = _mm256_set1_epi16((pel)max_val); + + if (max_val == 255) { // 8 bit_depth + shift1 = 0; + shift2 = 12; + } + else { // 10 bit_depth + shift1 = 2; + shift2 = 10; + } + + add1 = (1 << (shift1)) >> 1; + add2 = 1 << (shift2 - 1); + + src += -3 * i_src - 3; + + { + __m128i s0, s1; + __m256i S0, S1, S2; + __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[0])); + mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[1])); + mCoef2 
= _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[2])); + mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[3])); + offset = _mm256_set1_epi32(add1); + + row = height + 6; + + while (row > 0) { + T0 = _mm256_loadu_si256((__m256i*)(src)); + s0 = _mm_loadu_si128((__m128i*)(src + 4)); + T1 = _mm256_loadu_si256((__m256i*)(src + i_src)); + s1 = _mm_loadu_si128((__m128i*)(src + i_src + 4)); + row -= 2; + src += i_src << 1; + uavs3d_prefetch(src, _MM_HINT_NTA); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); + + S0 = _mm256_permute2x128_si256(T0, T1, 0x20); + S2 = _mm256_permute2x128_si256(T0, T1, 0x31); + S1 = _mm256_set_m128i(s1, s0); + + T0 = _mm256_shuffle_epi8(S0, mShuffle0); + T1 = _mm256_shuffle_epi8(S0, mShuffle1); + T2 = _mm256_shuffle_epi8(S1, mShuffle0); + T3 = _mm256_shuffle_epi8(S1, mShuffle1); + T4 = _mm256_shuffle_epi8(S2, mShuffle0); + T5 = _mm256_shuffle_epi8(S2, mShuffle1); + + M0 = _mm256_madd_epi16(T0, mCoef0); + M1 = _mm256_madd_epi16(T1, mCoef1); + M2 = _mm256_madd_epi16(T2, mCoef2); + M3 = _mm256_madd_epi16(T3, mCoef3); + M4 = _mm256_madd_epi16(T2, mCoef0); + M5 = _mm256_madd_epi16(T3, mCoef1); + M6 = _mm256_madd_epi16(T4, mCoef2); + M7 = _mm256_madd_epi16(T5, mCoef3); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + M2 = _mm256_add_epi32(M4, M5); + M3 = _mm256_add_epi32(M6, M7); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); M2 = _mm256_add_epi32(M0, offset); M3 = _mm256_add_epi32(M1, offset); M2 = _mm256_srai_epi32(M2, shift1); M3 = _mm256_srai_epi32(M3, shift1); M2 = _mm256_packs_epi32(M2, M3); - _mm256_storeu_si256((__m256i*)(tmp + i), M2); - p += 8; + _mm256_store_si256((__m256i*)(tmp), M2); + + tmp += i_tmp * 2; + } + { + // the last row + __m128i t0, t1, t2, t3, t4, t5; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i s2; + s0 = _mm_loadu_si128((__m128i*)(src)); + s1 = _mm_loadu_si128((__m128i*)(src + 4)); + s2 = _mm_loadu_si128((__m128i*)(src + 8)); + src += i_src; + + t0 = 
_mm_shuffle_epi8(s0, _mm256_castsi256_si128(mShuffle0)); + t1 = _mm_shuffle_epi8(s0, _mm256_castsi256_si128(mShuffle1)); + t2 = _mm_shuffle_epi8(s1, _mm256_castsi256_si128(mShuffle0)); + t3 = _mm_shuffle_epi8(s1, _mm256_castsi256_si128(mShuffle1)); + t4 = _mm_shuffle_epi8(s2, _mm256_castsi256_si128(mShuffle0)); + t5 = _mm_shuffle_epi8(s2, _mm256_castsi256_si128(mShuffle1)); + + m0 = _mm_madd_epi16(t0, _mm256_castsi256_si128(mCoef0)); + m1 = _mm_madd_epi16(t1, _mm256_castsi256_si128(mCoef1)); + m2 = _mm_madd_epi16(t2, _mm256_castsi256_si128(mCoef2)); + m3 = _mm_madd_epi16(t3, _mm256_castsi256_si128(mCoef3)); + m4 = _mm_madd_epi16(t2, _mm256_castsi256_si128(mCoef0)); + m5 = _mm_madd_epi16(t3, _mm256_castsi256_si128(mCoef1)); + m6 = _mm_madd_epi16(t4, _mm256_castsi256_si128(mCoef2)); + m7 = _mm_madd_epi16(t5, _mm256_castsi256_si128(mCoef3)); + + m0 = _mm_add_epi32(m0, m1); + m1 = _mm_add_epi32(m2, m3); + m2 = _mm_add_epi32(m4, m5); + m3 = _mm_add_epi32(m6, m7); + + m0 = _mm_add_epi32(m0, m1); + m1 = _mm_add_epi32(m2, m3); + + m2 = _mm_add_epi32(m0, _mm256_castsi256_si128(offset)); + m3 = _mm_add_epi32(m1, _mm256_castsi256_si128(offset)); + m2 = _mm_srai_epi32(m2, shift1); + m3 = _mm_srai_epi32(m3, shift1); + m2 = _mm_packs_epi32(m2, m3); + _mm_store_si128((__m128i*)tmp, m2); } - tmp += i_tmp; - src += i_src; } - offset = _mm256_set1_epi32(add2); - tmp = tmp_res; + { + __m256i N0, N1, N2, N3, N4, N5, N6, N7; + __m256i T6, T7; + offset = _mm256_set1_epi32(add2); + tmp = tmp_res; - coeff0 = _mm_set1_epi16(*(s16*)(coef_y)); - coeff1 = _mm_set1_epi16(*(s16*)(coef_y + 2)); - coeff2 = _mm_set1_epi16(*(s16*)(coef_y + 4)); - coeff3 = _mm_set1_epi16(*(s16*)(coef_y + 6)); - coeff00 = _mm256_cvtepi8_epi16(coeff0); - coeff01 = _mm256_cvtepi8_epi16(coeff1); - coeff02 = _mm256_cvtepi8_epi16(coeff2); - coeff03 = _mm256_cvtepi8_epi16(coeff3); + mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[0])); + mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[1])); + 
mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[2])); + mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[3])); - while (height--) { - const pel *p = (pel*)tmp; - for (i = 0; i < width; i += 16) { - T0 = _mm256_load_si256((__m256i*)(p)); - T1 = _mm256_load_si256((__m256i*)(p + i_tmp)); - T2 = _mm256_load_si256((__m256i*)(p + 2 * i_tmp)); - T3 = _mm256_load_si256((__m256i*)(p + 3 * i_tmp)); - T4 = _mm256_load_si256((__m256i*)(p + 4 * i_tmp)); - T5 = _mm256_load_si256((__m256i*)(p + 5 * i_tmp)); - T6 = _mm256_load_si256((__m256i*)(p + 6 * i_tmp)); - T7 = _mm256_load_si256((__m256i*)(p + 7 * i_tmp)); + while (height > 0) { + T0 = _mm256_load_si256((__m256i*)(tmp)); + T1 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp)); + T2 = _mm256_load_si256((__m256i*)(tmp + 2 * i_tmp)); + T3 = _mm256_loadu_si256((__m256i*)(tmp + 3 * i_tmp)); + T4 = _mm256_load_si256((__m256i*)(tmp + 4 * i_tmp)); + T5 = _mm256_loadu_si256((__m256i*)(tmp + 5 * i_tmp)); + T6 = _mm256_load_si256((__m256i*)(tmp + 6 * i_tmp)); + T7 = _mm256_loadu_si256((__m256i*)(tmp + 7 * i_tmp)); + height -= 2; + tmp += i_tmp * 2; M0 = _mm256_unpacklo_epi16(T0, T1); M1 = _mm256_unpacklo_epi16(T2, T3); @@ -3935,14 +4371,14 @@ void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i M6 = _mm256_unpackhi_epi16(T4, T5); M7 = _mm256_unpackhi_epi16(T6, T7); - N0 = _mm256_madd_epi16(M0, coeff00); - N1 = _mm256_madd_epi16(M1, coeff01); - N2 = _mm256_madd_epi16(M2, coeff02); - N3 = _mm256_madd_epi16(M3, coeff03); - N4 = _mm256_madd_epi16(M4, coeff00); - N5 = _mm256_madd_epi16(M5, coeff01); - N6 = _mm256_madd_epi16(M6, coeff02); - N7 = _mm256_madd_epi16(M7, coeff03); + N0 = _mm256_madd_epi16(M0, mCoef0); + N1 = _mm256_madd_epi16(M1, mCoef1); + N2 = _mm256_madd_epi16(M2, mCoef2); + N3 = _mm256_madd_epi16(M3, mCoef3); + N4 = _mm256_madd_epi16(M4, mCoef0); + N5 = _mm256_madd_epi16(M5, mCoef1); + N6 = _mm256_madd_epi16(M6, mCoef2); + N7 = _mm256_madd_epi16(M7, mCoef3); N0 = 
_mm256_add_epi32(N0, N1); N1 = _mm256_add_epi32(N2, N3); @@ -3958,14 +4394,164 @@ void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i N1 = _mm256_srai_epi32(N1, shift2); N0 = _mm256_packus_epi32(N0, N1); N0 = _mm256_min_epu16(N0, max_pel); - _mm256_storeu_si256((__m256i*)(dst + i), N0); - p += 16; + _mm_storeu_si128((__m128i*)(dst), _mm256_castsi256_si128(N0)); + _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(N0, 1)); + + dst += i_dst << 1; } - dst += i_dst; - tmp += i_tmp; + } +} + +void uavs3d_if_hor_ver_luma_w16x_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val) +{ + ALIGNED_32(s16 tmp_res[(128 + 7) * 128]); + s16 *tmp = tmp_res; + int row, i;; + int add1, shift1; + int add2, shift2; + __m256i offset; + __m256i T0, T1, T2, T3, T4, T5; + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + int i_tmp = width; + __m256i mCoef0, mCoef1, mCoef2, mCoef3; + __m256i max_pel = _mm256_set1_epi16((pel)max_val); + + if (max_val == 255) { // 8 bit_depth + shift1 = 0; + shift2 = 12; + } + else { // 10 bit_depth + shift1 = 2; + shift2 = 10; } + add1 = (1 << (shift1)) >> 1; + add2 = 1 << (shift2 - 1); + + src += -3 * i_src - 3; + + { + __m256i S0, S1, S2; + __m256i mShuffle0 = _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i mShuffle1 = _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[0])); + mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[1])); + mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[2])); + mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_x)[3])); + offset = _mm256_set1_epi32(add1); + + row = height + 7; + + while (row--) { + const pel *p = src; + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); + for (i = 0; 
i < width; i += 16) { + S0 = _mm256_loadu_si256((__m256i*)(p)); + S1 = _mm256_loadu_si256((__m256i*)(p + 4)); + S2 = _mm256_loadu_si256((__m256i*)(p + 8)); + + T0 = _mm256_shuffle_epi8(S0, mShuffle0); + T1 = _mm256_shuffle_epi8(S0, mShuffle1); + T2 = _mm256_shuffle_epi8(S1, mShuffle0); + T3 = _mm256_shuffle_epi8(S1, mShuffle1); + T4 = _mm256_shuffle_epi8(S2, mShuffle0); + T5 = _mm256_shuffle_epi8(S2, mShuffle1); + + M0 = _mm256_madd_epi16(T0, mCoef0); + M1 = _mm256_madd_epi16(T1, mCoef1); + M2 = _mm256_madd_epi16(T2, mCoef2); + M3 = _mm256_madd_epi16(T3, mCoef3); + M4 = _mm256_madd_epi16(T2, mCoef0); + M5 = _mm256_madd_epi16(T3, mCoef1); + M6 = _mm256_madd_epi16(T4, mCoef2); + M7 = _mm256_madd_epi16(T5, mCoef3); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + M2 = _mm256_add_epi32(M4, M5); + M3 = _mm256_add_epi32(M6, M7); + + M0 = _mm256_add_epi32(M0, M1); + M1 = _mm256_add_epi32(M2, M3); + + p += 16; + M2 = _mm256_add_epi32(M0, offset); + M3 = _mm256_add_epi32(M1, offset); + M2 = _mm256_srai_epi32(M2, shift1); + M3 = _mm256_srai_epi32(M3, shift1); + M2 = _mm256_packs_epi32(M2, M3); + _mm256_storeu_si256((__m256i*)(tmp + i), M2); + } + tmp += i_tmp; + src += i_src; + } + } + + { + __m256i N0, N1, N2, N3, N4, N5, N6, N7; + __m256i T6, T7; + offset = _mm256_set1_epi32(add2); + tmp = tmp_res; + + mCoef0 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[0])); + mCoef1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[1])); + mCoef2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[2])); + mCoef3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(((short*)coef_y)[3])); + + while (height--) { + const pel *p = (pel*)tmp; + for (i = 0; i < width; i += 16) { + T0 = _mm256_load_si256((__m256i*)(p)); + T1 = _mm256_load_si256((__m256i*)(p + i_tmp)); + T2 = _mm256_load_si256((__m256i*)(p + 2 * i_tmp)); + T3 = _mm256_load_si256((__m256i*)(p + 3 * i_tmp)); + T4 = _mm256_load_si256((__m256i*)(p + 4 * i_tmp)); + T5 = _mm256_load_si256((__m256i*)(p + 5 * 
i_tmp)); + T6 = _mm256_load_si256((__m256i*)(p + 6 * i_tmp)); + T7 = _mm256_load_si256((__m256i*)(p + 7 * i_tmp)); + + M0 = _mm256_unpacklo_epi16(T0, T1); + M1 = _mm256_unpacklo_epi16(T2, T3); + M2 = _mm256_unpacklo_epi16(T4, T5); + M3 = _mm256_unpacklo_epi16(T6, T7); + M4 = _mm256_unpackhi_epi16(T0, T1); + M5 = _mm256_unpackhi_epi16(T2, T3); + M6 = _mm256_unpackhi_epi16(T4, T5); + M7 = _mm256_unpackhi_epi16(T6, T7); + + N0 = _mm256_madd_epi16(M0, mCoef0); + N1 = _mm256_madd_epi16(M1, mCoef1); + N2 = _mm256_madd_epi16(M2, mCoef2); + N3 = _mm256_madd_epi16(M3, mCoef3); + N4 = _mm256_madd_epi16(M4, mCoef0); + N5 = _mm256_madd_epi16(M5, mCoef1); + N6 = _mm256_madd_epi16(M6, mCoef2); + N7 = _mm256_madd_epi16(M7, mCoef3); + + N0 = _mm256_add_epi32(N0, N1); + N1 = _mm256_add_epi32(N2, N3); + N2 = _mm256_add_epi32(N4, N5); + N3 = _mm256_add_epi32(N6, N7); + + N0 = _mm256_add_epi32(N0, N1); + N1 = _mm256_add_epi32(N2, N3); + + N0 = _mm256_add_epi32(N0, offset); + N1 = _mm256_add_epi32(N1, offset); + N0 = _mm256_srai_epi32(N0, shift2); + N1 = _mm256_srai_epi32(N1, shift2); + N0 = _mm256_packus_epi32(N0, N1); + N0 = _mm256_min_epu16(N0, max_pel); + _mm256_storeu_si256((__m256i*)(dst + i), N0); + + p += 16; + } + dst += i_dst; + tmp += i_tmp; + } + } } void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i_dst, int width, int height, const s8 *coef_x, const s8 *coef_y, int max_val) @@ -3979,14 +4565,6 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i int shift1, shift2; int add1, add2; - __m128i coef0 = _mm_set1_epi16(*(s16*)coef_x); - __m128i coef1 = _mm_set1_epi16(*(s16*)(coef_x + 2)); - __m256i mCoef0 = _mm256_cvtepi8_epi16(coef0); - __m256i mCoef1 = _mm256_cvtepi8_epi16(coef1); - __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11); - __m256i T0, T1, S0, S1, sum; - __m256i mAddOffset; - if (max_val == 255) { // 8 bit_depth shift1 = 
0; shift2 = 12; @@ -3999,25 +4577,34 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i add1 = (1 << (shift1)) >> 1; add2 = 1 << (shift2 - 1); - mAddOffset = _mm256_set1_epi32(add1); //HOR + __m128i coef0 = _mm_cvtepi8_epi16(_mm_set1_epi16(((s16*)coef_x)[0])); + __m128i coef1 = _mm_cvtepi8_epi16(_mm_set1_epi16(((s16*)coef_x)[1])); + __m256i mCoef0 = _mm256_set_m128i(coef1, coef0); + __m256i mCoef1 = _mm256_set_m128i(coef0, coef1); + __m256i mSwitch = _mm256_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 0, 1, 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11); + __m256i T0, T1, S0, S1, sum; + __m256i mAddOffset = _mm256_set1_epi32(add1); + __m128i mDst; + __m128i s0; + src = src - i_src - 2; row = height + 3; while (row--) { - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); S0 = _mm256_loadu_si256((__m256i*)(src)); - S1 = _mm256_loadu_si256((__m256i*)(src + 4)); - S0 = _mm256_permute4x64_epi64(S0, 0x94); - S1 = _mm256_permute4x64_epi64(S1, 0x94); - T0 = _mm256_madd_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef0); - T1 = _mm256_madd_epi16(_mm256_shuffle_epi8(S1, mSwitch), mCoef1); + s0 = _mm_loadu_si128((__m128i*)(src + 4)); + uavs3d_prefetch(src + i_src, _MM_HINT_NTA); + S1 = _mm256_set_m128i(s0, s0); + T0 = _mm256_shuffle_epi8(S0, mSwitch); + T1 = _mm256_shuffle_epi8(S1, mSwitch); + T0 = _mm256_madd_epi16(T0, mCoef0); + T1 = _mm256_madd_epi16(T1, mCoef1); sum = _mm256_add_epi32(T0, T1); sum = _mm256_add_epi32(sum, mAddOffset); sum = _mm256_srai_epi32(sum, shift1); - sum = _mm256_packs_epi32(sum, sum); - sum = _mm256_permute4x64_epi64(sum, 0xd8); - _mm_storeu_si128((__m128i*)(tmp), _mm256_castsi256_si128(sum)); + mDst = _mm_packs_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + _mm_store_si128((__m128i*)(tmp), mDst); src += i_src; tmp += i_tmp; @@ -4027,46 +4614,49 @@ void uavs3d_if_hor_ver_chroma_w8_avx2(const pel *src, int i_src, pel *dst, int i tmp = tmp_res; { - __m128i max_val1 = 
_mm_set1_epi16((pel)max_val); - __m128i coeff0, coeff1, mVal, mAddOffset2; - __m128i M0, M1, M2, M3; + __m128i coeff0, coeff1; + __m256i MaxVal = _mm256_set1_epi16((pel)max_val); + __m256i C0, C1, mVal, mAddOffset2; + __m256i M0, M1, M2, M3; coeff0 = _mm_set1_epi16(*(s16*)coef_y); coeff1 = _mm_set1_epi16(*(s16*)(coef_y + 2)); - mAddOffset2 = _mm_set1_epi32(add2); - - coeff0 = _mm_cvtepi8_epi16(coeff0); - coeff1 = _mm_cvtepi8_epi16(coeff1); - while (height--) { - __m128i T00 = _mm_load_si128((__m128i*)(tmp)); - __m128i T10 = _mm_load_si128((__m128i*)(tmp + i_tmp)); - __m128i T20 = _mm_load_si128((__m128i*)(tmp + i_tmp2)); - __m128i T30 = _mm_load_si128((__m128i*)(tmp + i_tmp3)); + mAddOffset2 = _mm256_set1_epi32(add2); - M0 = _mm_unpacklo_epi16(T00, T10); - M1 = _mm_unpacklo_epi16(T20, T30); - M2 = _mm_unpackhi_epi16(T00, T10); - M3 = _mm_unpackhi_epi16(T20, T30); - - M0 = _mm_madd_epi16(M0, coeff0); - M1 = _mm_madd_epi16(M1, coeff1); - M2 = _mm_madd_epi16(M2, coeff0); - M3 = _mm_madd_epi16(M3, coeff1); - - M0 = _mm_add_epi32(M0, M1); - M2 = _mm_add_epi32(M2, M3); - - M0 = _mm_add_epi32(M0, mAddOffset2); - M2 = _mm_add_epi32(M2, mAddOffset2); - M0 = _mm_srai_epi32(M0, shift2); - M2 = _mm_srai_epi32(M2, shift2); - - mVal = _mm_packus_epi32(M0, M2); - mVal = _mm_min_epu16(mVal, max_val1); - _mm_storeu_si128((__m128i*)dst, mVal); - - tmp += i_tmp; - dst += i_dst; + C0 = _mm256_cvtepi8_epi16(coeff0); + C1 = _mm256_cvtepi8_epi16(coeff1); + while (height) { + __m256i T00 = _mm256_load_si256((__m256i*)(tmp)); + __m256i T10 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp)); + __m256i T20 = _mm256_load_si256((__m256i*)(tmp + i_tmp2)); + __m256i T30 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp3)); + + M0 = _mm256_unpacklo_epi16(T00, T10); + M1 = _mm256_unpacklo_epi16(T20, T30); + M2 = _mm256_unpackhi_epi16(T00, T10); + M3 = _mm256_unpackhi_epi16(T20, T30); + + M0 = _mm256_madd_epi16(M0, C0); + M1 = _mm256_madd_epi16(M1, C1); + M2 = _mm256_madd_epi16(M2, C0); + M3 = 
_mm256_madd_epi16(M3, C1); + + M0 = _mm256_add_epi32(M0, M1); + M2 = _mm256_add_epi32(M2, M3); + + M0 = _mm256_add_epi32(M0, mAddOffset2); + M2 = _mm256_add_epi32(M2, mAddOffset2); + M0 = _mm256_srai_epi32(M0, shift2); + M2 = _mm256_srai_epi32(M2, shift2); + + mVal = _mm256_packus_epi32(M0, M2); + mVal = _mm256_min_epu16(mVal, MaxVal); + _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(mVal)); + _mm_storeu_si128((__m128i*)(dst + i_dst), _mm256_extracti128_si256(mVal, 1)); + + height -= 2; + tmp += i_tmp2; + dst += i_dst << 1; } } } @@ -4113,28 +4703,26 @@ void uavs3d_if_hor_ver_chroma_w16x_avx2(const pel *src, int i_src, pel *dst, int while (row--) { uavs3d_prefetch(src + i_src, _MM_HINT_NTA); for (col = 0; col < width; col += 16) { - S0 = _mm256_loadu_si256((__m256i*)(src + col)); - S1 = _mm256_loadu_si256((__m256i*)(src + col + 4)); + S0 = _mm256_loadu_si256((__m256i*)(src + col)); + S1 = _mm256_loadu_si256((__m256i*)(src + col + 4)); S2 = _mm256_loadu_si256((__m256i*)(src + col + 8)); - S3 = _mm256_loadu_si256((__m256i*)(src + col + 12)); - S0 = _mm256_permute4x64_epi64(S0, 0x94); - S1 = _mm256_permute4x64_epi64(S1, 0x94); - S2 = _mm256_permute4x64_epi64(S2, 0x94); - S3 = _mm256_permute4x64_epi64(S3, 0x94); - T0 = _mm256_madd_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef0); - T1 = _mm256_madd_epi16(_mm256_shuffle_epi8(S1, mSwitch), mCoef1); - T2 = _mm256_madd_epi16(_mm256_shuffle_epi8(S2, mSwitch), mCoef0); - T3 = _mm256_madd_epi16(_mm256_shuffle_epi8(S3, mSwitch), mCoef1); - T0 = _mm256_add_epi32(T0, T1); - T2 = _mm256_add_epi32(T2, T3); + T0 = _mm256_shuffle_epi8(S0, mSwitch); + T1 = _mm256_shuffle_epi8(S1, mSwitch); + T2 = _mm256_shuffle_epi8(S1, mSwitch); + T3 = _mm256_shuffle_epi8(S2, mSwitch); + S0 = _mm256_madd_epi16(T0, mCoef0); + S1 = _mm256_madd_epi16(T1, mCoef1); + S2 = _mm256_madd_epi16(T2, mCoef0); + S3 = _mm256_madd_epi16(T3, mCoef1); + T0 = _mm256_add_epi32(S0, S1); + T2 = _mm256_add_epi32(S2, S3); T0 = _mm256_add_epi32(T0, mAddOffset); T2 = 
_mm256_add_epi32(T2, mAddOffset); T0 = _mm256_srai_epi32(T0, shift1); T2 = _mm256_srai_epi32(T2, shift1); T0 = _mm256_packs_epi32(T0, T2); - T0 = _mm256_permute4x64_epi64(T0, 0xd8); - _mm256_storeu_si256((__m256i*)(tmp + col), T0); + _mm256_store_si256((__m256i*)(tmp + col), T0); } src += i_src; tmp += i_tmp; diff --git a/source/decore/avx2/intra_pred_avx2.c b/source/decore/avx2/intra_pred_avx2.c index 07e19b2..18961f6 100644 --- a/source/decore/avx2/intra_pred_avx2.c +++ b/source/decore/avx2/intra_pred_avx2.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -204,7 +199,7 @@ void uavs3d_ipred_hor_avx2(pel *src, pel *dst, int i_dst, int width, int height) void uavs3d_ipred_dc_avx2(pel *src, pel *dst, int i_dst, int width, int height, u16 avail_cu, int bit_depth) { - int i, x, y; + int x, y; int dc; pel *p_src = src - 1; int left_avail = IS_AVAIL(avail_cu, AVAIL_LE); @@ -212,6 +207,7 @@ void uavs3d_ipred_dc_avx2(pel *src, pel *dst, int i_dst, int width, int height, if (left_avail && above_avail) { int length = width + height + 1; + int i; __m128i sum = _mm_setzero_si128(); __m128i val; @@ -2738,7 +2734,6 @@ void uavs3d_ipred_ang_xy_18_avx2(pel *src, pel *dst, int i_dst, int mode, int wi dst += i_dst; } break; - break; } } @@ -3452,10 +3447,10 @@ void uavs3d_ipred_ver_avx2(pel *src, pel *dst, int i_dst, int width, int height) T0 = _mm256_loadu_si256((__m256i *)(src)); T1 = _mm256_loadu_si256((__m256i *)(src + 16)); for (y = 0; y < height; y += 2) { - _mm256_store_si256((__m256i *)(dst), T0); - _mm256_store_si256((__m256i *)(dst + 16), T1); - _mm256_store_si256((__m256i *)(dst + i_dst), T0); - _mm256_store_si256((__m256i *)(dst + i_dst + 16), T1); + _mm256_storeu_si256((__m256i *)(dst), T0); + _mm256_storeu_si256((__m256i *)(dst + 16), T1); + _mm256_storeu_si256((__m256i *)(dst + i_dst), T0); + _mm256_storeu_si256((__m256i *)(dst + i_dst + 16), T1); dst += i_dst2; } break; @@ -3468,14 +3463,14 @@ void uavs3d_ipred_ver_avx2(pel *src, pel *dst, int i_dst, int width, int height) T2 = _mm256_loadu_si256((__m256i *)(src + 32)); T3 = _mm256_loadu_si256((__m256i *)(src + 48)); for (y = 0; y < height; y += 2) { - _mm256_store_si256((__m256i *)(dst), T0); - _mm256_store_si256((__m256i *)(dst + 16), T1); - _mm256_store_si256((__m256i *)(dst + 32), T2); - _mm256_store_si256((__m256i *)(dst + 48), T3); - _mm256_store_si256((__m256i *)(dst + i_dst), T0); - _mm256_store_si256((__m256i *)(dst + i_dst + 16), T1); - _mm256_store_si256((__m256i *)(dst + i_dst + 32), T2); - _mm256_store_si256((__m256i *)(dst + i_dst + 48), T3); + 
_mm256_storeu_si256((__m256i *)(dst), T0); + _mm256_storeu_si256((__m256i *)(dst + 16), T1); + _mm256_storeu_si256((__m256i *)(dst + 32), T2); + _mm256_storeu_si256((__m256i *)(dst + 48), T3); + _mm256_storeu_si256((__m256i *)(dst + i_dst), T0); + _mm256_storeu_si256((__m256i *)(dst + i_dst + 16), T1); + _mm256_storeu_si256((__m256i *)(dst + i_dst + 32), T2); + _mm256_storeu_si256((__m256i *)(dst + i_dst + 48), T3); dst += i_dst2; } break; @@ -3539,10 +3534,10 @@ void uavs3d_ipred_hor_avx2(pel *src, pel *dst, int i_dst, int width, int height) for (y = 0; y < height; y += 2) { T0 = _mm256_set1_epi16(src[-y]); T1 = _mm256_set1_epi16(src[-y - 1]); - _mm256_store_si256((__m256i *)(dst), T0); - _mm256_store_si256((__m256i *)(dst + 16), T0); - _mm256_store_si256((__m256i *)(dst + i_dst), T1); - _mm256_store_si256((__m256i *)(dst + i_dst + 16), T1); + _mm256_storeu_si256((__m256i *)(dst), T0); + _mm256_storeu_si256((__m256i *)(dst + 16), T0); + _mm256_storeu_si256((__m256i *)(dst + i_dst), T1); + _mm256_storeu_si256((__m256i *)(dst + i_dst + 16), T1); dst += i_dst2; } break; @@ -3553,14 +3548,14 @@ void uavs3d_ipred_hor_avx2(pel *src, pel *dst, int i_dst, int width, int height) for (y = 0; y < height; y += 2) { T0 = _mm256_set1_epi16(src[-y]); T1 = _mm256_set1_epi16(src[-y - 1]); - _mm256_store_si256((__m256i *)(dst), T0); - _mm256_store_si256((__m256i *)(dst + 16), T0); - _mm256_store_si256((__m256i *)(dst + 32), T0); - _mm256_store_si256((__m256i *)(dst + 48), T0); - _mm256_store_si256((__m256i *)(dst + i_dst), T1); - _mm256_store_si256((__m256i *)(dst + i_dst + 16), T1); - _mm256_store_si256((__m256i *)(dst + i_dst + 32), T1); - _mm256_store_si256((__m256i *)(dst + i_dst + 48), T1); + _mm256_storeu_si256((__m256i *)(dst), T0); + _mm256_storeu_si256((__m256i *)(dst + 16), T0); + _mm256_storeu_si256((__m256i *)(dst + 32), T0); + _mm256_storeu_si256((__m256i *)(dst + 48), T0); + _mm256_storeu_si256((__m256i *)(dst + i_dst), T1); + _mm256_storeu_si256((__m256i *)(dst + 
i_dst + 16), T1); + _mm256_storeu_si256((__m256i *)(dst + i_dst + 32), T1); + _mm256_storeu_si256((__m256i *)(dst + i_dst + 48), T1); dst += i_dst2; } break; diff --git a/source/decore/avx2/itrans_avx2.c b/source/decore/avx2/itrans_avx2.c index 9b1df21..d8766b6 100644 --- a/source/decore/avx2/itrans_avx2.c +++ b/source/decore/avx2/itrans_avx2.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -79,14 +74,14 @@ #define TRANSPOSE_16x8_32BIT_16BIT(I00, I01, I02, I03, I04, I05, I06, I07, I08, I09, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7)\ TRANSPOSE_8x8_32BIT_16BIT(I00, I01, I02, I03, I04, I05, I06, I07, I04, I05, I06, I07); \ TRANSPOSE_8x8_32BIT_16BIT(I08, I09, I10, I11, I12, I13, I14, I15, I12, I13, I14, I15); \ - O0 = _mm256_insertf128_si256(I04, _mm256_castsi256_si128(I12), 1); \ - O1 = _mm256_insertf128_si256(I05, _mm256_castsi256_si128(I13), 1); \ - O2 = _mm256_insertf128_si256(I06, _mm256_castsi256_si128(I14), 1); \ - O3 = _mm256_insertf128_si256(I07, _mm256_castsi256_si128(I15), 1); \ - O4 = _mm256_insertf128_si256(I12, _mm256_extracti128_si256(I04, 1), 0); \ - O5 = _mm256_insertf128_si256(I13, _mm256_extracti128_si256(I05, 1), 0); \ - O6 = _mm256_insertf128_si256(I14, _mm256_extracti128_si256(I06, 1), 0); \ - O7 = _mm256_insertf128_si256(I15, _mm256_extracti128_si256(I07, 1), 0) + O0 = _mm256_permute2x128_si256(I04, I12, 0x20); \ + O1 = _mm256_permute2x128_si256(I05, I13, 0x20); \ + O2 = _mm256_permute2x128_si256(I06, I14, 0x20); \ + O3 = _mm256_permute2x128_si256(I07, I15, 0x20); \ + O4 = _mm256_permute2x128_si256(I04, I12, 0x31); \ + O5 = _mm256_permute2x128_si256(I05, I13, 0x31); \ + O6 = _mm256_permute2x128_si256(I06, I14, 0x31); \ + O7 = _mm256_permute2x128_si256(I07, I15, 0x31) static void uavs3d_always_inline dct2_butterfly_h4_avx2(s16* src, s16* dst, int line, int shift, int bit_depth) @@ -272,10 +267,10 @@ static void uavs3d_always_inline dct2_butterfly_h8_avx2(s16* src, int i_src, s16 // transpose 8x8 : 8 x 8(32bit) --> 4 x 16(16bit) TRANSPOSE_8x8_32BIT_16BIT(d0, d1, d2, d3, d4, d5, d6, d7, d4, d5, d6, d7); - d0 = _mm256_insertf128_si256(d4, _mm256_castsi256_si128(d5), 1); - d1 = _mm256_insertf128_si256(d6, _mm256_castsi256_si128(d7), 1); - d2 = _mm256_insertf128_si256(d5, _mm256_extracti128_si256(d4, 1), 0); - d3 = _mm256_insertf128_si256(d7, _mm256_extracti128_si256(d6, 1), 0); + d0 = _mm256_permute2x128_si256(d4, 
d5, 0x20); + d2 = _mm256_permute2x128_si256(d4, d5, 0x31); + d1 = _mm256_permute2x128_si256(d6, d7, 0x20); + d3 = _mm256_permute2x128_si256(d6, d7, 0x31); if (bit_depth != MAX_TX_DYNAMIC_RANGE) { __m256i max_val = _mm256_set1_epi16((1 << bit_depth) - 1); diff --git a/source/decore/avx2/pixel_avx2.c b/source/decore/avx2/pixel_avx2.c index 8031fe7..10d48f9 100644 --- a/source/decore/avx2/pixel_avx2.c +++ b/source/decore/avx2/pixel_avx2.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -986,8 +981,8 @@ void uavs3d_recon_chroma_w16_avx2(s16 *resi_u, s16 *resi_v, pel *pred, int width r1 = _mm256_loadu_si256((const __m256i*)(resi_v)); r2 = _mm256_unpacklo_epi16(r0, r1); // UV interlaced: uv0-uv4 uv8-uv12 r3 = _mm256_unpackhi_epi16(r0, r1); - r0 = _mm256_insertf128_si256(r2, _mm256_castsi256_si128(r3), 0x1); // uv0-uv8 - r1 = _mm256_insertf128_si256(r3, _mm256_extracti128_si256(r2, 1), 0x0); + r0 = _mm256_permute2x128_si256(r2, r3, 0x20); // uv0-uv8 + r1 = _mm256_permute2x128_si256(r2, r3, 0x31); p0 = _mm256_adds_epi16(p0, r0); p1 = _mm256_adds_epi16(p1, r1); @@ -1035,8 +1030,8 @@ void uavs3d_recon_chroma_w16_avx2(s16 *resi_u, s16 *resi_v, pel *pred, int width r1 = _mm256_loadu_si256((const __m256i*)(resi_v)); r2 = _mm256_unpacklo_epi16(zero, r1); // UV interlaced: uv0-uv4 uv8-uv12 r3 = _mm256_unpackhi_epi16(zero, r1); - r0 = _mm256_insertf128_si256(r2, _mm256_castsi256_si128(r3), 0x1); // uv0-uv8 - r1 = _mm256_insertf128_si256(r3, _mm256_extracti128_si256(r2, 1), 0x0); + r0 = _mm256_permute2x128_si256(r2, r3, 0x20); // uv0-uv8 + r1 = _mm256_permute2x128_si256(r2, r3, 0x31); p0 = _mm256_adds_epi16(p0, r0); p1 = _mm256_adds_epi16(p1, r1); @@ -1073,8 +1068,8 @@ void uavs3d_recon_chroma_w16x_avx2(s16 *resi_u, s16 *resi_v, pel *pred, int widt r1 = _mm256_loadu_si256((const __m256i*)(resi_v + j)); r2 = _mm256_unpacklo_epi16(r0, r1); // UV interlaced: uv0-uv4 uv8-uv12 r3 = _mm256_unpackhi_epi16(r0, r1); - r0 = _mm256_insertf128_si256(r2, _mm256_castsi256_si128(r3), 0x1); // uv0-uv8 - r1 = _mm256_insertf128_si256(r3, _mm256_extracti128_si256(r2, 1), 0x0); + r0 = _mm256_permute2x128_si256(r2, r3, 0x20); // uv0-uv8 + r1 = _mm256_permute2x128_si256(r2, r3, 0x31); p0 = _mm256_adds_epi16(p0, r0); p1 = _mm256_adds_epi16(p1, r1); @@ -1126,8 +1121,8 @@ void uavs3d_recon_chroma_w16x_avx2(s16 *resi_u, s16 *resi_v, pel *pred, int widt r1 = _mm256_loadu_si256((const __m256i*)(resi_v + j)); r2 = _mm256_unpacklo_epi16(zero, r1); // UV interlaced: uv0-uv4 uv8-uv12 r3 = 
_mm256_unpackhi_epi16(zero, r1); - r0 = _mm256_insertf128_si256(r2, _mm256_castsi256_si128(r3), 0x1); // uv0-uv8 - r1 = _mm256_insertf128_si256(r3, _mm256_extracti128_si256(r2, 1), 0x0); + r0 = _mm256_permute2x128_si256(r2, r3, 0x20); // uv0-uv8 + r1 = _mm256_permute2x128_si256(r2, r3, 0x31); p0 = _mm256_adds_epi16(p0, r0); p1 = _mm256_adds_epi16(p1, r1); diff --git a/source/decore/avx2/sao_avx2.c b/source/decore/avx2/sao_avx2.c index 73c01b4..7d2d527 100644 --- a/source/decore/avx2/sao_avx2.c +++ b/source/decore/avx2/sao_avx2.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/com_def.h b/source/decore/com_def.h index a8e9446..8b7ad27 100644 --- a/source/decore/com_def.h +++ b/source/decore/com_def.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -40,14 +35,18 @@ #include "com_sys.h" -#define BIT_DEPTH 8 - #define CHECK_RAND_STRM 0 -#if (BIT_DEPTH == 8) -typedef u8 pel; /* pixel type */ +#ifndef COMPILE_10BIT +#define COMPILE_10BIT 0 +#endif + +#if COMPILE_10BIT +typedef unsigned short pel; /* pixel type */ +#define BIT_DEPTH 10 #else -typedef s16 pel; /* pixel type */ +typedef unsigned char pel; /* pixel type */ +#define BIT_DEPTH 8 #endif /************************* profile & level **********************************************/ diff --git a/source/decore/com_sys.h b/source/decore/com_sys.h index 0cb4359..2ea3237 100644 --- a/source/decore/com_sys.h +++ b/source/decore/com_sys.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/com_table.c b/source/decore/com_table.c index bbc40d9..2042e6d 100644 --- a/source/decore/com_table.c +++ b/source/decore/com_table.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/com_table.h b/source/decore/com_table.h index 2c2bb3d..c419405 100644 --- a/source/decore/com_table.h +++ b/source/decore/com_table.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/com_type.h b/source/decore/com_type.h index 5bb8337..0a7db50 100644 --- a/source/decore/com_type.h +++ b/source/decore/com_type.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/com_util.c b/source/decore/com_util.c index 7e374c9..353804b 100644 --- a/source/decore/com_util.c +++ b/source/decore/com_util.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/com_util.h b/source/decore/com_util.h index b6887bb..721a0c1 100644 --- a/source/decore/com_util.h +++ b/source/decore/com_util.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -41,10 +36,10 @@ #include "com_type.h" /* function selection define based on platforms */ -#if (defined(__ANDROID__) && defined(__aarch64__)) || (defined(__APPLE__) && defined(__arm64__)) +#if defined(_arm64) || (defined(__APPLE__) && defined(__arm64__)) #define ENABLE_FUNCTION_C 1 #define ENABLE_FUNCTION_ARM64 1 -#elif (defined(__ANDROID__) && defined(__arm__)) || (defined(__APPLE__) && defined(__ARM_NEON__)) +#elif defined(_armv7a) || (defined(__APPLE__) && defined(__ARM_NEON__)) #define ENABLE_FUNCTION_C 1 #define ENABLE_FUNCTION_ARM32 1 #elif (defined(__WIN32__) || defined(_WIN32)) || (defined(__MACOSX__) || defined(macintosh) || defined(__linux__) || defined(__unix__)) && (defined(__i386__) || defined(__x86_64__) || defined(__AMD64__)) diff --git a/source/decore/deblock.c b/source/decore/deblock.c index 0996c57..f9764d6 100644 --- a/source/decore/deblock.c +++ b/source/decore/deblock.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -65,7 +60,7 @@ static int uavs3d_always_inline skip_filter(com_map_t *map, com_ref_pic_t refp[M { com_scu_t MbQ = map->map_scu[scup + offset]; com_pic_t *q_pic0, *q_pic1; - const com_scu_t mask = {0, 0, 0, 0, 1, 0, 0}; + const com_scu_t mask = {0, 1, 0, 0, 1, 0, 0}; if ((*(u8*)&MbQ) & (*(u8*)&mask)) { return 0; @@ -210,7 +205,7 @@ void com_deblock_set_edge(com_core_t *core) int scu_x = core->cu_pix_x >> MIN_CU_LOG2; int scu_y = core->cu_pix_y >> MIN_CU_LOG2; const int grad_mask = (LOOPFILTER_GRID >> 2) - 1; - const com_scu_t mask = { 0, 0, 0, 0, 1, 0, 0 }; + const com_scu_t mask = { 0, 1, 0, 0, 1, 0, 0 }; com_scu_t scu = map->map_scu[scup]; if ((*(u8*)&scu) & (*(u8*)&mask)) { diff --git a/source/decore/inter_pred.c b/source/decore/inter_pred.c index c53d399..470c84c 100644 --- a/source/decore/inter_pred.c +++ b/source/decore/inter_pred.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -486,10 +481,10 @@ void uavs3d_always_inline com_affine_mc_chroma(com_core_t *core, pel *dstc, int int max_posx = (seqhdr->pic_width + 4) >> 1; int max_posy = (seqhdr->pic_height + 4) >> 1; int i_asb_mv = cu_width >> 2; - s32(*asb_mv1)[MV_D] = asb_mv0 + i_asb_mv; int i_src = ref_pic->stride_chroma; if (sub_blk_size == 4) { + s32(*asb_mv1)[MV_D] = asb_mv0 + i_asb_mv; for (h = 0; h < cu_height; h += 8) { int base_y = (y + h) << 4; for (w = 0; w < cu_width; w += 8, asb_mv0 += 2, asb_mv1 += 2) { diff --git a/source/decore/intra_pred.c b/source/decore/intra_pred.c index de1eef6..c767be0 100644 --- a/source/decore/intra_pred.c +++ b/source/decore/intra_pred.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -320,7 +315,7 @@ void ipred_plane(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth) int ib_shift[5] = { 7, 10, 11, 15, 19 }; int idx_w = g_tbl_log2[w] - 2; int idx_h = g_tbl_log2[h] - 2; - int im_h, is_h, im_v, is_v, temp, temp2; + int im_h, is_h, im_v, is_v, temp; int max_pel = (1 << bit_depth) - 1; int val; @@ -343,7 +338,7 @@ void ipred_plane(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth) temp = a - (h2 - 1) * c - (w2 - 1) * b + 16; for (y = 0; y < h; y++) { - temp2 = temp; + int temp2 = temp; for (x = 0; x < w; x++) { val = temp2 >> 5; dst[x] = (pel)COM_CLIP3(0, max_pel, val); @@ -368,7 +363,7 @@ void ipred_plane_ipf(pel *src, s16 *dst, int w, int h) int ib_shift[5] = { 7, 10, 11, 15, 19 }; int idx_w = g_tbl_log2[w] - 2; int idx_h = g_tbl_log2[h] - 2; - int im_h, is_h, im_v, is_v, temp, temp2; + int im_h, is_h, im_v, is_v, temp; im_h = ib_mult[idx_w]; is_h = ib_shift[idx_w]; im_v = ib_mult[idx_h]; @@ -388,7 +383,7 @@ void ipred_plane_ipf(pel *src, s16 *dst, int w, int h) temp = a - (h2 - 1) * c - (w2 - 1) * b + 16; for (y = 0; y < h; y++) { - temp2 = temp; + int temp2 = temp; for (x = 0; x < w; x++) { dst[x] = (s16)(temp2 >> 5); temp2 += b; @@ -416,7 +411,7 @@ void ipred_plane_uv(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth) int 
idx_w = g_tbl_log2[w] - 2; int idx_h = g_tbl_log2[h] - 2; int im_h, is_h, im_v, is_v; - int temp_u, temp_v, temp2_u, temp2_v; + int temp_u, temp_v; int max_pel = (1 << bit_depth) - 1; int val_u, val_v; @@ -448,8 +443,8 @@ void ipred_plane_uv(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth) temp_v = a_v - (h2 - 1) * c_v - (w2 - 1) * b_v + 16; for (y = 0; y < h; y++) { - temp2_u = temp_u; - temp2_v = temp_v; + int temp2_u = temp_u; + int temp2_v = temp_v; for (x = 0; x < width2; x += 2) { val_u = temp2_u >> 5; val_v = temp2_v >> 5; @@ -475,8 +470,7 @@ void ipred_bi(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth) int ishift = COM_MIN(ishift_x, ishift_y); int ishift_xy = ishift_x + ishift_y + 1; int offset = 1 << (ishift_x + ishift_y); - int a, b, c, wt, wxy, tmp; - int predx; + int a, b, c, wt, tmp; int ref_up[MAX_CU_SIZE], ref_le[MAX_CU_SIZE], up[MAX_CU_SIZE], le[MAX_CU_SIZE], wy[MAX_CU_SIZE]; int wc, tbl_wc[6] = {-1, 21, 13, 7, 4, 2}; int max_pel = (1 << bit_depth) - 1; @@ -510,8 +504,8 @@ void ipred_bi(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth) tmp += wt; } for( y = 0; y < h; y++ ) { - predx = ref_le[y]; - wxy = 0; + int predx = ref_le[y]; + int wxy = 0; for( x = 0; x < w; x++ ) { predx += le[y]; ref_up[x] += up[x]; @@ -534,8 +528,7 @@ void ipred_bi_ipf(pel *src, s16 *dst, int w, int h) int ishift = COM_MIN(ishift_x, ishift_y); int ishift_xy = ishift_x + ishift_y + 1; int offset = 1 << (ishift_x + ishift_y); - int a, b, c, wt, wxy, tmp; - int predx; + int a, b, c, wt, tmp; int ref_up[MAX_CU_SIZE], ref_le[MAX_CU_SIZE], up[MAX_CU_SIZE], le[MAX_CU_SIZE], wy[MAX_CU_SIZE]; int wc, tbl_wc[6] = { -1, 21, 13, 7, 4, 2 }; wc = ishift_x > ishift_y ? 
ishift_x - ishift_y : ishift_y - ishift_x; @@ -566,8 +559,8 @@ void ipred_bi_ipf(pel *src, s16 *dst, int w, int h) tmp += wt; } for (y = 0; y < h; y++) { - predx = ref_le[y]; - wxy = 0; + int predx = ref_le[y]; + int wxy = 0; for (x = 0; x < w; x++) { predx += le[y]; ref_up[x] += up[x]; @@ -589,9 +582,8 @@ void ipred_bi_uv(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth) int ishift = COM_MIN(ishift_x, ishift_y); int ishift_xy = ishift_x + ishift_y + 1; int offset = 1 << (ishift_x + ishift_y); - int a_u, b_u, c_u, wt_u, wxy_u, tmp_u; - int a_v, b_v, c_v, wt_v, wxy_v, tmp_v; - int predx_u, predx_v; + int a_u, b_u, c_u, wt_u, tmp_u; + int a_v, b_v, c_v, wt_v, tmp_v; int ref_up[MAX_CU_SIZE], ref_le[MAX_CU_SIZE], up[MAX_CU_SIZE], le[MAX_CU_SIZE], wy[MAX_CU_SIZE]; int wc, tbl_wc[6] = { -1, 21, 13, 7, 4, 2 }; int w2 = w << 1; @@ -640,9 +632,10 @@ void ipred_bi_uv(pel *src, pel *dst, int i_dst, int w, int h, int bit_depth) } for (y = 0; y < h; y++) { int y2 = y << 1; - predx_u = ref_le[y2 ]; - predx_v = ref_le[y2 + 1]; - wxy_u = wxy_v = 0; + int predx_u = ref_le[y2 ]; + int predx_v = ref_le[y2 + 1]; + int wxy_u = 0; + int wxy_v = 0; for (x = 0; x < w2; x += 2) { predx_u += le[y2]; predx_v += le[y2 + 1]; @@ -1034,7 +1027,6 @@ static void uavs3d_always_inline ipf_core_s16(pel *src, pel *dst, int i_dst, s16 s32 filter_idx_ver = (s32)g_tbl_log2[h] - 2; //Block Size s32 ver_filter_range = COM_MIN(h, 10); s32 hor_filter_range = COM_MIN(w, 10); - int max_val = (1 << bit_depth) - 1; // TODO: g_ipf_pred_param doesn't support 128 if (filter_idx_hor > 4) { @@ -1300,7 +1292,6 @@ static void xPredIntraAngAdi_X_8(pel *pSrc, pel *dst, int i_dst, int uiDirMode, int line_size = iWidth + iHeight / 2 - 1; int real_size = min(line_size, iWidth * 2 + 1); int i; - int pad1, pad2; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel *pfirst[2] = { first_line, first_line + aligned_line_size }; @@ -1311,6 +1302,8 @@ static void xPredIntraAngAdi_X_8(pel *pSrc, pel *dst, int i_dst, 
int uiDirMode, // padding if (real_size < line_size) { + int pad1, pad2; + pfirst[1][real_size - 1] = pfirst[1][real_size - 2]; pad1 = pfirst[0][real_size - 1]; @@ -1466,7 +1459,6 @@ static void xPredIntraAngAdi_Y_28(pel *pSrc, pel *dst, int i_dst, int uiDirMode, int real_size = min(line_size, iHeight * 4 + 1); int i; int iHeight2 = iHeight << 1; - int pad1, pad2; for (i = 0; i < real_size; i += 2, pSrc--) { first_line[i] = (pSrc[0] + (pSrc[-1] + pSrc[-2]) * 3 + pSrc[-3] + 4) >> 3; @@ -1475,6 +1467,7 @@ static void xPredIntraAngAdi_Y_28(pel *pSrc, pel *dst, int i_dst, int uiDirMode, // padding if (real_size < line_size) { + int pad1, pad2; first_line[i - 1] = first_line[i - 3]; pad1 = first_line[i - 2]; diff --git a/source/decore/inv_trans.c b/source/decore/inv_trans.c index 2be533a..2d60b20 100644 --- a/source/decore/inv_trans.c +++ b/source/decore/inv_trans.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/modules.h b/source/decore/modules.h index 026d237..00d65b1 100644 --- a/source/decore/modules.h +++ b/source/decore/modules.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/pic_manager.c b/source/decore/pic_manager.c index 1c9be09..1a09a38 100644 --- a/source/decore/pic_manager.c +++ b/source/decore/pic_manager.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/recon.c b/source/decore/recon.c index c6466ba..951957a 100644 --- a/source/decore/recon.c +++ b/source/decore/recon.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/sao.c b/source/decore/sao.c index b39466d..9004046 100644 --- a/source/decore/sao.c +++ b/source/decore/sao.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/sse/alf_sse.c b/source/decore/sse/alf_sse.c index 2880605..24e203e 100644 --- a/source/decore/sse/alf_sse.c +++ b/source/decore/sse/alf_sse.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/sse/deblock_sse.c b/source/decore/sse/deblock_sse.c index 271c2fc..ab88636 100644 --- a/source/decore/sse/deblock_sse.c +++ b/source/decore/sse/deblock_sse.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/sse/inter_pred_sse.c b/source/decore/sse/inter_pred_sse.c index 170d079..7faf0d7 100644 --- a/source/decore/sse/inter_pred_sse.c +++ b/source/decore/sse/inter_pred_sse.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -55,25 +50,59 @@ void uavs3d_if_cpy_sse(const pel *src, int i_src, pel *dst, int i_dst, int width void uavs3d_if_cpy_w4_sse(const pel *src, int i_src, pel *dst, int i_dst, int width, int height) { - while (height) { + if (height < 4) { CP32(dst, src); CP32(dst + i_dst, src + i_src); - height -= 2; - src += i_src << 1; - dst += i_dst << 1; + } + else { + int i_src2 = i_src << 1; + int i_dst2 = i_dst << 1; + int i_src3 = i_src + i_src2; + int i_dst3 = i_dst + i_dst2; + int i_src4 = i_src << 2; + int i_dst4 = i_dst << 2; + while (height > 0) { + CP32(dst, src); + CP32(dst + i_dst, src + i_src); + CP32(dst + i_dst2, src + i_src2); + CP32(dst + i_dst3, src + i_src3); + height -= 4; + src += i_src4; + dst += i_dst4; + } } } void uavs3d_if_cpy_w8_sse(const pel *src, int i_src, pel *dst, int i_dst, int width, int height) { - int i_src2 = i_src << 1; - int i_dst2 = i_dst << 1; - while (height) { - CP64(dst, src); - CP64(dst + i_dst, src + i_src); - src += i_src2; - dst += i_dst2; - height -= 2; + if (height < 4) { + __m128i m0, m1; + m0 = _mm_loadl_epi64((const __m128i*)src); + m1 = _mm_loadl_epi64((const __m128i*)(src + i_src)); + _mm_storel_epi64((__m128i*)dst, m0); + _mm_storel_epi64((__m128i*)(dst + i_dst), m1); + } else { + __m128i m0, m1, m2, m3; + int i_src2 = i_src << 1; + int i_dst2 = i_dst << 1; + int i_src3 = i_src + i_src2; + int i_dst3 = i_dst + i_dst2; + int i_src4 = i_src << 2; + int i_dst4 = i_dst << 2; + while (height > 0) { /* '> 0', as in the w4 variant: the loop steps by 4 rows, so testing for exactly 0 would never terminate if height is not a multiple of 4 */ + m0 = _mm_loadl_epi64((const __m128i*)src); + m1 = _mm_loadl_epi64((const __m128i*)(src + i_src)); + m2 = _mm_loadl_epi64((const __m128i*)(src + i_src2)); + m3 = _mm_loadl_epi64((const __m128i*)(src + i_src3)); + height -= 4; + src += i_src4; + + _mm_storel_epi64((__m128i*)dst, m0); + _mm_storel_epi64((__m128i*)(dst + i_dst), m1); + _mm_storel_epi64((__m128i*)(dst + i_dst2), m2); + _mm_storel_epi64((__m128i*)(dst + i_dst3), m3); + dst += i_dst4; + } } } @@ -371,7 +400,7 @@ void uavs3d_if_hor_luma_w8_sse(const pel *src, int i_src, pel 
*dst, int i_dst, i height -= 2; _mm_storel_epi64((__m128i*)dst, T0); - M64(dst + i_dst) = _mm_extract_epi64(T0, 1); + _mm_storeh_pi((__m64*)(dst + i_dst), _mm_castsi128_ps(T0)); src += i_src << 1; dst += i_dst << 1; @@ -512,7 +541,6 @@ void uavs3d_if_ver_chroma_w16x_sse(const pel *src, int i_src, pel *dst, int i_ds const int offset = 32; const int shift = 6; __m128i mAddOffset = _mm_set1_epi16(offset); - pel const *p; __m128i coeff0 = _mm_set1_epi16(*(s16*)coeff); __m128i coeff1 = _mm_set1_epi16(*(s16*)(coeff + 2)); __m128i mVal1, mVal2; @@ -520,7 +548,7 @@ void uavs3d_if_ver_chroma_w16x_sse(const pel *src, int i_src, pel *dst, int i_ds src -= i_src; while (height--) { - p = src; + pel const *p = src; uavs3d_prefetch(src + 4 * i_src, _MM_HINT_NTA); for (col = 0; col < width; col += 16) { __m128i T01 = _mm_loadu_si128((__m128i*)(p)); @@ -739,7 +767,6 @@ void uavs3d_if_ver_luma_w16x_sse(const pel *src, int i_src, pel *dst, int i_dst, int col; const int offset = 32; const int shift = 6; - pel const *p; __m128i mAddOffset = _mm_set1_epi16(offset); __m128i coeff0 = _mm_set1_epi16(*(s16*)coeff); __m128i coeff1 = _mm_set1_epi16(*(s16*)(coeff + 2)); @@ -750,7 +777,7 @@ void uavs3d_if_ver_luma_w16x_sse(const pel *src, int i_src, pel *dst, int i_dst, src -= 3 * i_src; while (height--) { - p = src; + pel const *p = src; uavs3d_prefetch(src + 8 * i_src, _MM_HINT_NTA); for (col = 0; col < width; col += 16) { __m128i T01 = _mm_loadu_si128((__m128i*)(p)); @@ -811,7 +838,6 @@ void uavs3d_if_hor_ver_chroma_w8x_sse(const pel *src, int i_src, pel *dst, int i { int row, col; int shift; - s16 const *p; ALIGNED_16(s16 tmp_res[(64 + 3) * 64*2]); s16 *tmp = tmp_res; const int i_tmp = width; @@ -856,7 +882,7 @@ void uavs3d_if_hor_ver_chroma_w8x_sse(const pel *src, int i_src, pel *dst, int i coeff1_ver = _mm_cvtepi8_epi16(coeff1_ver); while (height--) { - p = tmp; + s16 const *p = tmp; for (col = 0; col < width; col += 8) { __m128i T00 = _mm_load_si128((__m128i*)(p)); __m128i T10 = 
_mm_load_si128((__m128i*)(p + i_tmp)); @@ -1411,7 +1437,6 @@ void uavs3d_if_hor_ver_luma_w8x_sse(const pel *src, int i_src, pel *dst, int i_d { int row, col; int shift; - s16 const *p; ALIGNED_16(s16 tmp_res[(128 + 7) * 128]); s16 *tmp = tmp_res; @@ -1473,7 +1498,7 @@ void uavs3d_if_hor_ver_luma_w8x_sse(const pel *src, int i_src, pel *dst, int i_d mCoefy4_ver = _mm_cvtepi8_epi16(mCoefy4_ver); while (height--) { - p = tmp; + s16 const *p = tmp; for (col = 0; col < width; col += 8) { __m128i T00 = _mm_load_si128((__m128i*)(p)); __m128i T10 = _mm_load_si128((__m128i*)(p + i_tmp)); @@ -2315,14 +2340,13 @@ void uavs3d_if_hor_ver_luma_w8_sse(const pel *src, int i_src, pel *dst, int i_ds int rows; int add1, shift1; int add2, shift2; - __m128i T0, T1, T2, T3, T4, T5, T6, T7; + __m128i T0, T1, T2, T3, T4, T5; __m128i M0, M1, M2, M3, M4, M5, M6, M7; - __m128i N0, N1, N2, N3, N4, N5, N6, N7; - __m128i mCoef, offset, max_pel; + __m128i offset, max_pel; ALIGNED_16(s16 tmp_res[(64 + 7) * 8]); s16 *tmp = tmp_res; const int i_tmp = 8; - __m128i coeff00, coeff01, coeff02, coeff03; + __m128i mCoef0, mCoef1, mCoef2, mCoef3; if (max_val == 255) { // 8 bit_depth shift1 = 0; @@ -2338,110 +2362,129 @@ void uavs3d_if_hor_ver_luma_w8_sse(const pel *src, int i_src, pel *dst, int i_ds src += -3 * i_src - 3; - mCoef = _mm_loadl_epi64((__m128i*)coef_x); - offset = _mm_set1_epi32(add1); - mCoef = _mm_cvtepi8_epi16(mCoef); - - // HOR - rows = height + 7; - while (rows--) { - uavs3d_prefetch(src + i_src, _MM_HINT_NTA); - T0 = _mm_loadu_si128((__m128i*)(src + 0)); - T1 = _mm_loadu_si128((__m128i*)(src + 1)); - T2 = _mm_loadu_si128((__m128i*)(src + 2)); - T3 = _mm_loadu_si128((__m128i*)(src + 3)); - T4 = _mm_loadu_si128((__m128i*)(src + 4)); - T5 = _mm_loadu_si128((__m128i*)(src + 5)); - T6 = _mm_loadu_si128((__m128i*)(src + 6)); - T7 = _mm_loadu_si128((__m128i*)(src + 7)); - - M0 = _mm_madd_epi16(T0, mCoef); - M1 = _mm_madd_epi16(T1, mCoef); - M2 = _mm_madd_epi16(T2, mCoef); - M3 = 
_mm_madd_epi16(T3, mCoef); - M4 = _mm_madd_epi16(T4, mCoef); - M5 = _mm_madd_epi16(T5, mCoef); - M6 = _mm_madd_epi16(T6, mCoef); - M7 = _mm_madd_epi16(T7, mCoef); + { + __m128i mShuffle0 = _mm_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m128i mShuffle1 = _mm_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + __m128i S0, S1, S2; + + mCoef0 = _mm_set1_epi16(((s16*)coef_x)[0]); + mCoef1 = _mm_set1_epi16(((s16*)coef_x)[1]); + mCoef2 = _mm_set1_epi16(((s16*)coef_x)[2]); + mCoef3 = _mm_set1_epi16(((s16*)coef_x)[3]); + mCoef0 = _mm_cvtepi8_epi16(mCoef0); + mCoef1 = _mm_cvtepi8_epi16(mCoef1); + mCoef2 = _mm_cvtepi8_epi16(mCoef2); + mCoef3 = _mm_cvtepi8_epi16(mCoef3); + offset = _mm_set1_epi32(add1); + + // HOR + rows = height + 7; + while (rows--) { + S0 = _mm_loadu_si128((__m128i*)(src)); + S1 = _mm_loadu_si128((__m128i*)(src + 4)); + S2 = _mm_loadu_si128((__m128i*)(src + 8)); + src += i_src; + uavs3d_prefetch(src, _MM_HINT_NTA); + + T0 = _mm_shuffle_epi8(S0, mShuffle0); + T1 = _mm_shuffle_epi8(S0, mShuffle1); + T2 = _mm_shuffle_epi8(S1, mShuffle0); + T3 = _mm_shuffle_epi8(S1, mShuffle1); + T4 = _mm_shuffle_epi8(S2, mShuffle0); + T5 = _mm_shuffle_epi8(S2, mShuffle1); + + M0 = _mm_madd_epi16(T0, mCoef0); + M1 = _mm_madd_epi16(T1, mCoef1); + M2 = _mm_madd_epi16(T2, mCoef2); + M3 = _mm_madd_epi16(T3, mCoef3); + M4 = _mm_madd_epi16(T2, mCoef0); + M5 = _mm_madd_epi16(T3, mCoef1); + M6 = _mm_madd_epi16(T4, mCoef2); + M7 = _mm_madd_epi16(T5, mCoef3); - M0 = _mm_hadd_epi32(M0, M1); - M1 = _mm_hadd_epi32(M2, M3); - M2 = _mm_hadd_epi32(M4, M5); - M3 = _mm_hadd_epi32(M6, M7); + M0 = _mm_add_epi32(M0, M1); + M1 = _mm_add_epi32(M2, M3); + M2 = _mm_add_epi32(M4, M5); + M3 = _mm_add_epi32(M6, M7); - M0 = _mm_hadd_epi32(M0, M1); - M1 = _mm_hadd_epi32(M2, M3); + M0 = _mm_add_epi32(M0, M1); + M1 = _mm_add_epi32(M2, M3); - M2 = _mm_add_epi32(M0, offset); - M3 = _mm_add_epi32(M1, offset); - M2 = _mm_srai_epi32(M2, shift1); - M3 = _mm_srai_epi32(M3, shift1); 
- M2 = _mm_packs_epi32(M2, M3); - _mm_storeu_si128((__m128i*)tmp, M2); + M2 = _mm_add_epi32(M0, offset); + M3 = _mm_add_epi32(M1, offset); + M2 = _mm_srai_epi32(M2, shift1); + M3 = _mm_srai_epi32(M3, shift1); + M2 = _mm_packs_epi32(M2, M3); + _mm_store_si128((__m128i*)tmp, M2); - tmp += i_tmp; - src += i_src; + tmp += i_tmp; + } } - offset = _mm_set1_epi32(add2); - max_pel = _mm_set1_epi16((pel)max_val); - tmp = tmp_res; - - coeff00 = _mm_set1_epi16(*(s16*)coef_y); - coeff01 = _mm_set1_epi16(*(s16*)(coef_y + 2)); - coeff02 = _mm_set1_epi16(*(s16*)(coef_y + 4)); - coeff03 = _mm_set1_epi16(*(s16*)(coef_y + 6)); - coeff00 = _mm_cvtepi8_epi16(coeff00); - coeff01 = _mm_cvtepi8_epi16(coeff01); - coeff02 = _mm_cvtepi8_epi16(coeff02); - coeff03 = _mm_cvtepi8_epi16(coeff03); - - while (height--) { - T0 = _mm_load_si128((__m128i*)(tmp)); - T1 = _mm_load_si128((__m128i*)(tmp + i_tmp)); - T2 = _mm_load_si128((__m128i*)(tmp + 2 * i_tmp)); - T3 = _mm_load_si128((__m128i*)(tmp + 3 * i_tmp)); - T4 = _mm_load_si128((__m128i*)(tmp + 4 * i_tmp)); - T5 = _mm_load_si128((__m128i*)(tmp + 5 * i_tmp)); - T6 = _mm_load_si128((__m128i*)(tmp + 6 * i_tmp)); - T7 = _mm_load_si128((__m128i*)(tmp + 7 * i_tmp)); + { + __m128i N0, N1, N2, N3, N4, N5, N6, N7; + __m128i T6, T7; + + offset = _mm_set1_epi32(add2); + max_pel = _mm_set1_epi16((pel)max_val); + tmp = tmp_res; + + mCoef0 = _mm_set1_epi16(((s16*)coef_y)[0]); + mCoef1 = _mm_set1_epi16(((s16*)coef_y)[1]); + mCoef2 = _mm_set1_epi16(((s16*)coef_y)[2]); + mCoef3 = _mm_set1_epi16(((s16*)coef_y)[3]); + mCoef0 = _mm_cvtepi8_epi16(mCoef0); + mCoef1 = _mm_cvtepi8_epi16(mCoef1); + mCoef2 = _mm_cvtepi8_epi16(mCoef2); + mCoef3 = _mm_cvtepi8_epi16(mCoef3); + + while (height--) { + T0 = _mm_load_si128((__m128i*)(tmp)); + T1 = _mm_load_si128((__m128i*)(tmp + i_tmp)); + T2 = _mm_load_si128((__m128i*)(tmp + 2 * i_tmp)); + T3 = _mm_load_si128((__m128i*)(tmp + 3 * i_tmp)); + T4 = _mm_load_si128((__m128i*)(tmp + 4 * i_tmp)); + T5 = _mm_load_si128((__m128i*)(tmp 
+ 5 * i_tmp)); + T6 = _mm_load_si128((__m128i*)(tmp + 6 * i_tmp)); + T7 = _mm_load_si128((__m128i*)(tmp + 7 * i_tmp)); - M0 = _mm_unpacklo_epi16(T0, T1); - M1 = _mm_unpacklo_epi16(T2, T3); - M2 = _mm_unpacklo_epi16(T4, T5); - M3 = _mm_unpacklo_epi16(T6, T7); - M4 = _mm_unpackhi_epi16(T0, T1); - M5 = _mm_unpackhi_epi16(T2, T3); - M6 = _mm_unpackhi_epi16(T4, T5); - M7 = _mm_unpackhi_epi16(T6, T7); + M0 = _mm_unpacklo_epi16(T0, T1); + M1 = _mm_unpacklo_epi16(T2, T3); + M2 = _mm_unpacklo_epi16(T4, T5); + M3 = _mm_unpacklo_epi16(T6, T7); + M4 = _mm_unpackhi_epi16(T0, T1); + M5 = _mm_unpackhi_epi16(T2, T3); + M6 = _mm_unpackhi_epi16(T4, T5); + M7 = _mm_unpackhi_epi16(T6, T7); - N0 = _mm_madd_epi16(M0, coeff00); - N1 = _mm_madd_epi16(M1, coeff01); - N2 = _mm_madd_epi16(M2, coeff02); - N3 = _mm_madd_epi16(M3, coeff03); - N4 = _mm_madd_epi16(M4, coeff00); - N5 = _mm_madd_epi16(M5, coeff01); - N6 = _mm_madd_epi16(M6, coeff02); - N7 = _mm_madd_epi16(M7, coeff03); + N0 = _mm_madd_epi16(M0, mCoef0); + N1 = _mm_madd_epi16(M1, mCoef1); + N2 = _mm_madd_epi16(M2, mCoef2); + N3 = _mm_madd_epi16(M3, mCoef3); + N4 = _mm_madd_epi16(M4, mCoef0); + N5 = _mm_madd_epi16(M5, mCoef1); + N6 = _mm_madd_epi16(M6, mCoef2); + N7 = _mm_madd_epi16(M7, mCoef3); - N0 = _mm_add_epi32(N0, N1); - N1 = _mm_add_epi32(N2, N3); - N2 = _mm_add_epi32(N4, N5); - N3 = _mm_add_epi32(N6, N7); + N0 = _mm_add_epi32(N0, N1); + N1 = _mm_add_epi32(N2, N3); + N2 = _mm_add_epi32(N4, N5); + N3 = _mm_add_epi32(N6, N7); - N0 = _mm_add_epi32(N0, N1); - N1 = _mm_add_epi32(N2, N3); + N0 = _mm_add_epi32(N0, N1); + N1 = _mm_add_epi32(N2, N3); - N0 = _mm_add_epi32(N0, offset); - N1 = _mm_add_epi32(N1, offset); - N0 = _mm_srai_epi32(N0, shift2); - N1 = _mm_srai_epi32(N1, shift2); - N0 = _mm_packus_epi32(N0, N1); - N0 = _mm_min_epu16(N0, max_pel); - _mm_storeu_si128((__m128i*)(dst), N0); + N0 = _mm_add_epi32(N0, offset); + N1 = _mm_add_epi32(N1, offset); + N0 = _mm_srai_epi32(N0, shift2); + N1 = _mm_srai_epi32(N1, shift2); + N0 = 
_mm_packus_epi32(N0, N1); + N0 = _mm_min_epu16(N0, max_pel); + _mm_storeu_si128((__m128i*)(dst), N0); - dst += i_dst; - tmp += i_tmp; + dst += i_dst; + tmp += i_tmp; + } } } diff --git a/source/decore/sse/intra_pred_sse.c b/source/decore/sse/intra_pred_sse.c index b877834..d77b556 100644 --- a/source/decore/sse/intra_pred_sse.c +++ b/source/decore/sse/intra_pred_sse.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
@@ -713,13 +708,14 @@ void uavs3d_ipred_chroma_hor_sse(pel *src, pel *dst, int i_dst, int width, int h void uavs3d_ipred_dc_sse(pel *src, pel *dst, int i_dst, int width, int height, u16 avail_cu, int bit_depth) { - int i, x, y; + int x, y; int dc; pel *p_src = src - 1; int left_avail = IS_AVAIL(avail_cu, AVAIL_LE); int above_avail = IS_AVAIL(avail_cu, AVAIL_UP); if (left_avail && above_avail) { + int i; int length = width + height + 1; __m128i sum = _mm_setzero_si128(); __m128i val; @@ -828,7 +824,7 @@ void uavs3d_ipred_dc_sse(pel *src, pel *dst, int i_dst, int width, int height, u void uavs3d_ipred_chroma_dc_sse(pel *src, pel *dst, int i_dst, int width, int height, u16 avail_cu, int bit_depth) { __m128i T; - int i, x, y; + int x, y; int dcU, dcV; pel *p_src = src - 2; int left_avail = IS_AVAIL(avail_cu, AVAIL_LE); @@ -838,6 +834,7 @@ void uavs3d_ipred_chroma_dc_sse(pel *src, pel *dst, int i_dst, int width, int he int height2 = height << 1; int wh = width + height; int length = (wh << 1) + 2; // 2*(width + height + 1) + int i; __m128i sum = _mm_setzero_si128(); __m128i val; @@ -1787,7 +1784,6 @@ void uavs3d_ipred_ipf_s16_sse(pel *src, pel *dst, int i_dst, s16* pred, int flt_ { pel *p_top = src + 1; int row; - int max_val = (1 << bit_depth) - 1; __m128i c_32 = _mm_set1_epi16(32); __m128i zero = _mm_setzero_si128(); if (w == 4) { diff --git a/source/decore/sse/itrans_sse.c b/source/decore/sse/itrans_sse.c index f7a5051..217e88e 100644 --- a/source/decore/sse/itrans_sse.c +++ b/source/decore/sse/itrans_sse.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/sse/pixel_sse.c b/source/decore/sse/pixel_sse.c index 46ce33f..804b71d 100644 --- a/source/decore/sse/pixel_sse.c +++ b/source/decore/sse/pixel_sse.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/sse/sao_sse.c b/source/decore/sse/sao_sse.c index 3459b3e..5f4723b 100644 --- a/source/decore/sse/sao_sse.c +++ b/source/decore/sse/sao_sse.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
diff --git a/source/decore/sse/sse.c b/source/decore/sse/sse.c index cb8c119..570edf5 100644 --- a/source/decore/sse/sse.c +++ b/source/decore/sse/sse.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/source/decore/sse/sse.h b/source/decore/sse/sse.h index 967808a..4e10ab7 100644 --- a/source/decore/sse/sse.h +++ b/source/decore/sse/sse.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -45,12 +40,19 @@ #include "modules.h" -#ifdef _WIN32 - -#ifndef _WIN64 -#define _mm_extract_epi64(a, i) (a.m128i_i64[i]) -#endif - +#if __x86_64__ +#elif __i386__ && !defined(_mm_extract_epi64) +#define _mm_extract_epi64 _mm_extract_epi64 +#include <stdint.h> +/* Fallback for i386 builds: rebuild 64-bit lane imm8 (0 = low, non-zero = high) of a from its four 16-bit lanes; each lane is widened to uint64_t before shifting so its top bit cannot sign-extend into the upper half of the result. */ +static inline int64_t _mm_extract_epi64(__m128i a, const int imm8) { + return imm8 ? 
(int64_t)(((uint64_t)(uint16_t)_mm_extract_epi16(a, 7) << 48) | + ((uint64_t)(uint16_t)_mm_extract_epi16(a, 6) << 32) | + ((uint64_t)(uint16_t)_mm_extract_epi16(a, 5) << 16) | (uint64_t)(uint16_t)_mm_extract_epi16(a, 4)) + : (int64_t)(((uint64_t)(uint16_t)_mm_extract_epi16(a, 3) << 48) | + ((uint64_t)(uint16_t)_mm_extract_epi16(a, 2) << 32) | + ((uint64_t)(uint16_t)_mm_extract_epi16(a, 1) << 16) | (uint64_t)(uint16_t)_mm_extract_epi16(a, 0)); +} #endif ALIGNED_32(extern pel uavs3d_simd_mask[15][16]); diff --git a/source/decore/threadpool.h b/source/decore/threadpool.h index 3370beb..6a74bac 100644 --- a/source/decore/threadpool.h +++ b/source/decore/threadpool.h @@ -11,9 +11,6 @@ typedef volatile long atom_t; // 32 bits, signed #if defined(_WIN32) #include "win32thread.h" #else - -#pragma comment(lib, "pthreadVC2.lib") - #include <pthread.h> #define uavs3d_pthread_t pthread_t #define uavs3d_pthread_create pthread_create diff --git a/test/utest.c b/test/utest.c index 724c7d8..e4df366 100644 --- a/test/utest.c +++ b/test/utest.c @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. 
Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. @@ -202,8 +197,8 @@ static int app_img_write(char * fname, uavs3d_io_frm_t * img, com_seqh_t *seqhdr return -1; } for (i = 0; i < 3; i++) { - int hor_size = seqhdr->horizontal_size >> (i ? 1 : 0); - int ver_size = seqhdr->vertical_size >> (i ? 1 : 0); + int hor_size = seqhdr->display_horizontal_size >> (i ? 1 : 0); + int ver_size = seqhdr->display_vertical_size >> (i ? 1 : 0); unsigned char * p8 = (unsigned char *)img->buffer[i]; for (j = 0; j < ver_size; j++) { @@ -412,10 +407,13 @@ void output_callback(uavs3d_io_frm_t *frm) { } #if defined(__APPLE__) && (defined(__arm64__) || defined(__ARM_NEON__)) -int uavs3d_decode_sample(int argc, const char **argv) -#else -int main(int argc, const char **argv) +#include <TargetConditionals.h> +#if !TARGET_OS_OSX +#define main uavs3d_decode_sample +#endif #endif + +int main(int argc, const char **argv) { int decoding = 1; unsigned char * bs_buf = NULL; @@ -465,7 +463,7 @@ int main(int argc, const char **argv) dec_cfg.log_level = 1; dec_cfg.frm_threads = 1; - if (argc < 2) { + if ((argc < 2) || !(argc % 2)) { log_level_0("Error config, please check arguments: \n"); print_help(); return -1; @@ -588,8 +586,8 @@ finished: if (frame_num) { width = dec_frame.width[0]; height = dec_frame.height[0]; - hor_size = dec_frame.seqhdr->horizontal_size; - ver_size = dec_frame.seqhdr->vertical_size; + hor_size = dec_frame.seqhdr->display_horizontal_size; + ver_size = dec_frame.seqhdr->display_vertical_size; log_level_1("=========================================================================================\n"); log_level_1(" Resolution = %d x %d (Coding: %d x %d)\n", hor_size, ver_size, width, height); diff --git a/test/utest.h b/test/utest.h index 
aceb8a4..1dbd239 100644 --- a/test/utest.h +++ b/test/utest.h @@ -1,5 +1,5 @@ /************************************************************************************** - * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School", + * Copyright (c) 2018-2022 ["Peking University Shenzhen Graduate School", * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"] * * All rights reserved. @@ -11,12 +11,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes the software uAVS3d developed by - * Peking University Shenzhen Graduate School, Peng Cheng Laboratory - * and Guangdong Bohua UHD Innovation Corporation. - * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School, + * 3. Neither the name of the organizations (Peking University Shenzhen Graduate School, * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. diff --git a/version.sh b/version.sh index 65e2df7..034d454 100755 --- a/version.sh +++ b/version.sh @@ -15,12 +15,12 @@ else shell_dir=$1 fi -VER_R=`git rev-list origin/master | sort | wc -l | gawk '{print $1}'` -VER_L=`git rev-list HEAD | sort | wc -l | gawk '{print $1}'` +VER_R=`git rev-list origin/master | sort | wc -l | awk '{print $1}'` +VER_L=`git rev-list HEAD | sort | wc -l | awk '{print $1}'` VER_SHA1=`git log -n 1 | head -n 1 | cut -d ' ' -f 2` major_version="1" -minor_version="1" +minor_version="2" type_version="release" # generate the file version.h