# StringZilla CMakeLists.txt
#
# This file defines several library build & installation targets:
#
# * stringzilla_header: A header-only library with the StringZilla C and C++ headers.
# * stringzilla_shared: A shared library with the StringZilla C and C++ headers and dynamic SIMD dispatch.
# * stringzilla_bare: A shared library with the StringZilla headers, but without linking the standard C library.
# * stringzillas_cpus_shared: A shared library with the StringZillas parallel algorithms for multi-threaded CPUs.
# * stringzillas_cuda_shared: A shared library with the StringZillas parallel algorithms for CUDA-capable GPUs.
# * stringzillas_rocm_shared: A shared library with the StringZillas parallel algorithms for ROCm-capable GPUs.
#
# Tests for different C++ standards:
#
# * stringzilla_test_cpp11: C++11 baseline support.
# * stringzilla_test_cpp14: C++14 support with `std::less<std::string>`-like function objects.
# * stringzilla_test_cpp17: C++17 support with `std::string_view` compatibility.
# * stringzilla_test_cpp20: C++20 support with `<=>` operator and more `constexpr` features.
#
# Tests for different SIMD architectures:
#
# * stringzilla_test_cpp20_serial: A test executable for serial execution.
# * stringzilla_test_cpp20_westmere: A test executable for SSE4.2.
# * stringzilla_test_cpp20_haswell: A test executable for AVX2.
# * stringzilla_test_cpp20_ice: A test executable for AVX-512.
# * stringzilla_test_cpp20_neon: A test executable for ARM Neon.
# * stringzilla_test_cpp20_sve: A test executable for ARM Scalable Vector Extension.
#
# Serial Benchmarks:
#
# * stringzilla_bench_find_cpp20: A benchmark for substring search operations.
# * stringzilla_bench_sequence_cpp20: A benchmark for string array-level operations.
# * stringzilla_bench_token_cpp20: A benchmark for comparators and hash functions.
# * stringzilla_bench_container_cpp20: A benchmark for STL containers powered by StringZilla.
# * stringzilla_bench_memory_cpp20: A benchmark for LibC-style low-level memory operations.
#
# Parallel Benchmarks:
#
# * stringzillas_bench_similarities_cpp20: A benchmark for similarity operations.
# * stringzillas_bench_similarities_cu20: A benchmark for similarity operations on GPU.
# * stringzillas_bench_fingerprints_cpp20: A benchmark for finding many substrings.
# * stringzillas_bench_fingerprints_cu20: A benchmark for finding many substrings on GPU.
#
# For higher-level language bindings separate build scripts are provided, native to each toolchain.
# Minimum CMake version and project declaration. The project enables the C, C++, and assembly languages; CUDA is
# enabled separately and only on request.
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(
    stringzilla
    VERSION 4.0.15
    LANGUAGES C CXX ASM
    DESCRIPTION "Search, hash, sort, fingerprint, and fuzzy-match strings faster via SWAR, SIMD, and GPGPU"
    HOMEPAGE_URL "https://github.com/ashvardanian/stringzilla"
)

# Default language standards; individual targets may override the C++ standard through `set_compiler_flags`
set(CMAKE_C_STANDARD 99)
set(CMAKE_CXX_STANDARD 11)

# Request strict ISO mode (e.g. `-std=c++11` rather than `-std=gnu++11`)
set(CMAKE_C_EXTENSIONS OFF)
set(CMAKE_CXX_EXTENSIONS OFF)
# NOTE(review): `set(<var>)` without a value unsets the variable instead of turning it ON.
# Confirm whether `set(CMAKE_COMPILE_WARNING_AS_ERROR ON)` was intended here.
set(CMAKE_COMPILE_WARNING_AS_ERROR)
# Name of the configuring user, read from the environment once at configure time.
# NOTE(review): not referenced anywhere else in this listing - confirm it is still needed.
set(DEV_USER_NAME $ENV{USER})

# Report the host compilers that CMake picked up
message(STATUS "C Compiler ID: ${CMAKE_C_COMPILER_ID}")
message(STATUS "C Compiler Version: ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "C Compiler: ${CMAKE_C_COMPILER}")
message(STATUS "C++ Compiler ID: ${CMAKE_CXX_COMPILER_ID}")
message(STATUS "C++ Compiler Version: ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C++ Compiler: ${CMAKE_CXX_COMPILER}")

# Probe for a CUDA compiler without enabling the language; the result becomes the default of `STRINGZILLA_BUILD_CUDA`
include(CheckLanguage)
check_language(CUDA)
if (NOT CMAKE_CUDA_COMPILER)
    set(STRINGZILLA_CAN_BUILD_CUDA OFF)
    message(STATUS "CUDA compiler not available")
else ()
    set(STRINGZILLA_CAN_BUILD_CUDA ON)
    message(STATUS "CUDA compiler available")
endif ()

# Report the pointer width; anything other than 8 bytes is reported as 32-bit
if (NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
    message(STATUS "Pointer size: 32-bit")
else ()
    message(STATUS "Pointer size: 64-bit")
endif ()

# Fall back to a "Release" build when neither a build type nor a multi-config generator's configuration list is given
if (NOT (CMAKE_BUILD_TYPE OR CMAKE_CONFIGURATION_TYPES))
    message(STATUS "Setting build type to 'Release' as none was specified.")
    set(CMAKE_BUILD_TYPE
        Release
        CACHE STRING "Choose the type of build." FORCE
    )
    # Offer the standard configurations as a drop-down in `cmake-gui` / `ccmake`
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif ()
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")

# Classify the target processor; only 64-bit x86 and 64-bit Arm are recognized, anything else leaves both
# `SZ_PLATFORM_X86` and `SZ_PLATFORM_ARM` undefined
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64")
    set(SZ_PLATFORM_X86 TRUE)
    message(STATUS "Platform: x86")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64")
    set(SZ_PLATFORM_ARM TRUE)
    message(STATUS "Platform: ARM")
endif ()

# StringZilla is the "main project" when configured directly, and a sub-project when pulled in through
# `add_subdirectory` / `FetchContent`. Tests, benchmarks, and shared libraries default to ON only in the former case.
if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
    set(STRINGZILLA_IS_MAIN_PROJECT ON)
else ()
    set(STRINGZILLA_IS_MAIN_PROJECT OFF)
endif ()

# User-facing build & installation switches
option(STRINGZILLA_INSTALL "Install CMake targets" OFF)
option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++" "${STRINGZILLA_IS_MAIN_PROJECT}")
option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++" "${STRINGZILLA_IS_MAIN_PROJECT}")
option(STRINGZILLA_BUILD_SHARED "Compile a dynamic library" "${STRINGZILLA_IS_MAIN_PROJECT}")
option(STRINGZILLAS_BUILD_SHARED "Compile dynamic parallel libraries" "${STRINGZILLA_IS_MAIN_PROJECT}")
option(STRINGZILLA_BUILD_CUDA "Build CUDA-accelerated targets" "${STRINGZILLA_CAN_BUILD_CUDA}")
option(STRINGZILLA_USE_SANITIZERS "Enable AddressSanitizer and UndefinedBehaviorSanitizer in Debug builds" ON)

# Free-form architecture name; an empty value makes `set_compiler_flags` fall back to the native architecture
set(STRINGZILLA_TARGET_ARCH "" CACHE STRING "Architecture to tell the compiler to optimize for (-march)")

# Enable CUDA if requested. Configuration aborts if `STRINGZILLA_BUILD_CUDA` is ON but the earlier
# `check_language(CUDA)` probe found no compiler.
if (STRINGZILLA_BUILD_CUDA)
    if (NOT STRINGZILLA_CAN_BUILD_CUDA)
        message(FATAL_ERROR "CUDA support requested but CUDA compiler not found")
    endif ()
    enable_language(CUDA)
    # Project-wide CUDA defaults: strict C++20 for device code, no compiler extensions
    set(CMAKE_CUDA_STANDARD 20)
    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
    set(CMAKE_CUDA_EXTENSIONS OFF)
    # NOTE(review): this is set unconditionally, so a value passed on the command line is shadowed.
    # Individual targets below also set their own `CUDA_ARCHITECTURES` property.
    set(CMAKE_CUDA_ARCHITECTURES 90a) # Hopper is the newest architecture we specialize for
    set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
    # Provides the `CUDA::cudart` / `CUDA::cuda_driver` imported targets and `CUDAToolkit_*` variables used below
    find_package(CUDAToolkit REQUIRED)
    message(STATUS "CUDA support enabled")
    message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}")
    message(STATUS "CUDA Compiler ID: ${CMAKE_CUDA_COMPILER_ID}")
    message(STATUS "CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
    message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
endif ()

# Make the project's own `cmake/` directory the first place searched for modules, then pull in the helpers we need
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
include(ExternalProject)
include(CheckCSourceCompiles)

# With CMP0077 set to NEW, `option()` honors normal variables set by a parent project (FetchContent /
# add_subdirectory) instead of overriding them; the policy exists since CMake 3.13
if (POLICY CMP0077)
    cmake_policy(SET CMP0077 NEW)
endif ()

# Header locations: the source tree for in-tree builds, the GNU-standard include directory once installed
include(GNUInstallDirs)
set(STRINGZILLA_INCLUDE_BUILD_DIR "${PROJECT_SOURCE_DIR}/include/")
set(STRINGZILLA_INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}")

# Register with CTest on CMake 3.13 and newer
if (NOT CMAKE_VERSION VERSION_LESS 3.13)
    include(CTest)
    enable_testing()
endif ()

if (MSVC)
    # Strip `/RTC*` (run-time checks) from the default MSVC debug flags. The checks depend on the C runtime, so they
    # must not reach targets built without it; `set_compiler_flags` adds `/RTC1` back for the targets that can use it.
    foreach (sz_debug_flags_var IN ITEMS CMAKE_CXX_FLAGS_DEBUG CMAKE_C_FLAGS_DEBUG)
        string(REGEX REPLACE "/RTC[^ ]*" "" ${sz_debug_flags_var} "${${sz_debug_flags_var}}")
    endforeach ()
endif ()

# Applies the project's default compiler-specific flags to a target.
#
# Arguments:
#
# * target: Name of an existing target; all options are attached to it as PRIVATE.
# * cpp_standard: C++ (or CUDA C++) standard number, like `17`; an empty string leaves the standard untouched.
# * target_arch: Value for `-march=` (GCC, Clang, NVCC host compiler) or `/arch:` (MSVC); an empty string selects
#   `-march=native` when the build targets the host processor, or `/arch:AVX2` on MSVC.
# * compiler_id: Compiler family to generate flags for: `GNU`, `Clang`, `AppleClang`, `MSVC`, or `NVIDIA`.
#
# Example: set_compiler_flags(my_test 20 "haswell" "${CMAKE_CXX_COMPILER_ID}")
function (set_compiler_flags target cpp_standard target_arch compiler_id)
    get_target_property(target_type ${target} TYPE)

    target_include_directories(${target} PRIVATE scripts)
    target_include_directories(${target} PRIVATE fork_union/include)

    # Set output directory for single-configuration generators (like Make). The empty `$<0:>` generator expression
    # keeps multi-config generators from appending a per-configuration subdirectory.
    set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/$<0:>)

    # Set output directory for multi-configuration generators (like Visual Studio)
    foreach (config IN LISTS CMAKE_CONFIGURATION_TYPES)
        string(TOUPPER ${config} config_upper)
        set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${config_upper} ${CMAKE_BINARY_DIR}/$<0:>)
    endforeach ()

    # Set the C++ standard
    if (NOT cpp_standard STREQUAL "")
        if (compiler_id STREQUAL "NVIDIA")
            set_target_properties(${target} PROPERTIES CUDA_STANDARD ${cpp_standard})
        elseif (compiler_id MATCHES "MSVC")
            # For MSVC, explicitly set the /std: flag - don't set CXX_STANDARD property to avoid conflicts
            target_compile_options(${target} PRIVATE "/std:c++${cpp_standard}")
        else ()
            set_target_properties(${target} PROPERTIES CXX_STANDARD ${cpp_standard})
        endif ()
    endif ()

    # Use the `/Zc:__cplusplus` flag to correctly define the `__cplusplus` macro in MSVC
    if (compiler_id MATCHES "MSVC")
        target_compile_options(${target} PRIVATE "/Zc:__cplusplus")
    endif ()

    # Make sure CUDA C++ allows calling `constexpr` from device code
    if (compiler_id STREQUAL "NVIDIA")
        target_compile_options(${target} PRIVATE "--expt-relaxed-constexpr")
    endif ()

    # Maximum warnings level & warnings as error.
    #
    # MSVC uses numeric values: > 4068 for "unknown pragmas". > 4146 for "unary minus operator applied to unsigned type,
    # result still unsigned". We also specify `/utf-8` to properly handle UTF-8 symbols in tests.
    if (compiler_id STREQUAL "GNU")
        # `-Wno-cast-function-type` and `-Wno-unused-function` are unique to the GCC branch
        target_compile_options(
            ${target}
            PRIVATE
                "-Wall;-Wextra;-pedantic;-Werror;-Wfatal-errors;-Wno-unknown-pragmas;-Wno-cast-function-type;-Wno-unused-function;-Wno-sign-conversion"
        )
    elseif (compiler_id STREQUAL "Clang" OR compiler_id STREQUAL "AppleClang")
        target_compile_options(
            ${target}
            PRIVATE "-Wall;-Wextra;-pedantic;-Werror;-Wfatal-errors;-Wno-unknown-pragmas;-Wno-sign-conversion"
        )
    elseif (compiler_id MATCHES "MSVC")
        target_compile_options(
            ${target}
            PRIVATE "/Bt" # Display build timings
                    "/wd4068" # Disable warning: unknown pragma
                    "/wd4146" # Disable warning: unary minus operator applied to unsigned type
                    "/wd4996" # Disable warning: 'unsafe' functions like getenv, fopen (use _s variants)
                    "/wd4244" # Disable warning: conversion with possible loss of data (e.g., float to int)
                    "/wd4267" # Disable warning: conversion from 'size_t' to smaller type, possible loss of data
                    "/utf-8" # Set source and execution character sets to UTF-8
                    "/WX" # Treat warnings as errors
        )
    elseif (compiler_id STREQUAL "NVIDIA")
        # Host-side `-Wall` and `-Wextra` are requested here once, through `-Xcompiler`
        target_compile_options(
            ${target}
            PRIVATE
                "-Xcompiler=-Wfatal-errors;-Xcompiler=-Wall;-Xcompiler=-Wextra;-Wno-unknown-pragmas;-Wno-cast-function-type;-Wno-unused-function"
        )
    endif ()

    # Set optimization options for different compilers differently.
    #
    # NOTE(review): these checks read `CMAKE_BUILD_TYPE`, which is empty under multi-config generators, so none of the
    # per-configuration flags below are applied there.
    if (compiler_id MATCHES "MSVC")
        if (CMAKE_BUILD_TYPE STREQUAL "Debug")
            target_compile_options(${target} PRIVATE "/Od;/Zi")
            # Run-time checks need the CRT, so shared libraries (which may be built without it) are skipped
            if (NOT target_type STREQUAL "SHARED_LIBRARY")
                target_compile_options(${target} PRIVATE "/RTC1")
            endif ()
        elseif (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
            target_compile_options(${target} PRIVATE "/O2;/Zi")
        endif ()
    elseif (
        compiler_id STREQUAL "GNU"
        OR compiler_id STREQUAL "Clang"
        OR compiler_id STREQUAL "AppleClang"
    )
        # RelWithDebInfo receives both groups: `-O0;-g` followed by `-O2`
        if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
            target_compile_options(${target} PRIVATE "-O0;-g")
        endif ()
        if (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
            target_compile_options(${target} PRIVATE "-O2")
        endif ()
    elseif (compiler_id STREQUAL "NVIDIA")
        if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
            target_compile_options(
                ${target}
                PRIVATE "-G" # Device debug symbols, which will add `-lineinfo` symbols to PTX
                        "-no-compress" # No compression of debug info
                        "-Xcompiler=-g" # Host debugging symbols explicitly
                        "-Xcompiler=-fno-omit-frame-pointer" # Stack trace clarity
                        "-Xcompiler=-fno-inline" # Prevent host inlining
                        "-maxrregcount=0" # No register count limits
            )
        endif ()
        if (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
            target_compile_options(
                ${target}
                PRIVATE "-O2" # Optimization level passed to NVCC itself
                        "-Xptxas=-O2" # Optimization level for the PTX assembler
                        "-Xcompiler=-O2" # Optimization level for the host compiler
            )
        endif ()
    endif ()

    # If available, enable Position Independent Code
    get_target_property(target_pic ${target} POSITION_INDEPENDENT_CODE)
    if (target_pic)
        target_compile_definitions(${target} PRIVATE "SZ_PIC")
    endif ()

    # Avoid builtin functions where we know what we are doing.
    if (compiler_id MATCHES "MSVC")
        target_compile_options(${target} PRIVATE "/Oi-")
    else ()
        target_compile_options(${target} PRIVATE "-fno-builtin-memcmp")
        target_compile_options(${target} PRIVATE "-fno-builtin-memchr")
        target_compile_options(${target} PRIVATE "-fno-builtin-memcpy")
        target_compile_options(${target} PRIVATE "-fno-builtin-memset")
    endif ()

    # Check for ${target_arch} and set it or use the current system if not defined
    if ("${target_arch}" STREQUAL "")
        # Only use the current system if we are not cross compiling, or if the host and the target processors are the
        # same. The comparison must be `STREQUAL` on expanded values: the right-hand side of `MATCHES` is a literal
        # regular expression and is never dereferenced as a variable name.
        if ((NOT CMAKE_CROSSCOMPILING) OR ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}"))
            if (compiler_id STREQUAL "NVIDIA")
                # For NVCC, pass native flag to host compiler
                include(CheckCXXCompilerFlag)
                check_cxx_compiler_flag("-march=native" supports_march_native)
                if (supports_march_native)
                    target_compile_options(${target} PRIVATE "-Xcompiler=-march=native")
                endif ()
            elseif (NOT (compiler_id MATCHES "MSVC"))
                include(CheckCXXCompilerFlag)
                check_cxx_compiler_flag("-march=native" supports_march_native)
                if (supports_march_native)
                    target_compile_options(${target} PRIVATE "-march=native")
                endif ()
            else ()
                # MSVC does not have a direct equivalent to -march=native
                target_compile_options(${target} PRIVATE "/arch:AVX2")
            endif ()
        endif ()
    else ()
        if (compiler_id MATCHES "MSVC")
            target_compile_options(${target} PRIVATE "/arch:${target_arch}")
        elseif (compiler_id STREQUAL "NVIDIA")
            # NVCC handles CPU architecture through host compiler flags
            target_compile_options(${target} PRIVATE "-Xcompiler=-march=${target_arch}")
        else ()
            target_compile_options(${target} PRIVATE "-march=${target_arch}")
        endif ()
    endif ()

    # Define SZ_IS_BIG_ENDIAN_ macro based on system byte order; any value other than "BIG_ENDIAN" (including an
    # undefined `CMAKE_C_BYTE_ORDER`) is treated as little-endian
    if (CMAKE_C_BYTE_ORDER STREQUAL "BIG_ENDIAN")
        set(SZ_IS_BIG_ENDIAN_ 1)
    else ()
        set(SZ_IS_BIG_ENDIAN_ 0)
    endif ()

    target_compile_definitions(${target} PRIVATE "SZ_IS_BIG_ENDIAN_=${SZ_IS_BIG_ENDIAN_}")

    # Sanitizer options for Debug mode; shared libraries are never instrumented
    if (CMAKE_BUILD_TYPE STREQUAL "Debug")
        target_compile_definitions(${target} PRIVATE "SZ_DEBUG=1")
        if (STRINGZILLA_USE_SANITIZERS AND NOT target_type STREQUAL "SHARED_LIBRARY")
            if (compiler_id MATCHES "MSVC")
                target_compile_options(${target} PRIVATE "/fsanitize=address;/fsanitize=leak")
                target_link_options(${target} PRIVATE "/fsanitize=address;/fsanitize=leak")
            elseif (compiler_id STREQUAL "NVIDIA")
                # ! NVCC can't handle sanitizers?!
                # https://stackoverflow.com/questions/75590579/cuda-fails-to-initialise-when-address-sanitizer-is-enabled
            else ()
                target_compile_options(${target} PRIVATE "-fsanitize=address" "-fsanitize=undefined")
                target_link_options(${target} PRIVATE "-fsanitize=address" "-fsanitize=undefined")
            endif ()
        endif ()
    else ()
        target_compile_definitions(${target} PRIVATE "SZ_DEBUG=0")
    endif ()
endfunction ()

# Builds a single-source host executable and registers it with CTest under the same name.
#
# * exec_name: Name of both the executable target and the test.
# * source: The one source file to compile.
# * cpp_standard: C++ standard number forwarded to `set_compiler_flags`.
# * target_arch: Architecture name forwarded to `set_compiler_flags`; may be empty.
function (define_launcher exec_name source cpp_standard target_arch)
    add_executable(${exec_name} ${source})
    set_compiler_flags(${exec_name} ${cpp_standard} "${target_arch}" "${CMAKE_CXX_COMPILER_ID}")

    # The header-only library supplies the include paths
    target_link_libraries(${exec_name} PRIVATE stringzilla_header)

    add_test(NAME ${exec_name} COMMAND ${exec_name})
endfunction ()

# Builds a single-source CUDA executable and registers it with CTest under the same name.
#
# * exec_name: Name of both the executable target and the test.
# * source: The one source file; it is compiled as CUDA regardless of its extension.
# * cuda_standard: CUDA C++ standard number forwarded to `set_compiler_flags`.
# * target_arch: Host CPU architecture name forwarded to `set_compiler_flags`; may be empty.
function (define_gpu_launcher exec_name source cuda_standard target_arch)
    add_executable(${exec_name} ${source})
    set_source_files_properties(${source} TARGET_DIRECTORY ${exec_name} PROPERTIES LANGUAGE CUDA)

    target_compile_definitions(${exec_name} PRIVATE "SZ_USE_CUDA=1")
    target_include_directories(${exec_name} PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
    set_target_properties(${exec_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

    set_compiler_flags(${exec_name} ${cuda_standard} "${target_arch}" "${CMAKE_CUDA_COMPILER_ID}")

    # Only targeting Ampere and Hopper architectures for now
    set_target_properties(${exec_name} PROPERTIES CUDA_ARCHITECTURES "80;90")

    # CUDA runtime and driver first, then the header-only library for the include paths
    target_link_libraries(${exec_name} PRIVATE CUDA::cudart CUDA::cuda_driver stringzilla_header)

    add_test(NAME ${exec_name} COMMAND ${exec_name})
endfunction ()

if (STRINGZILLA_BUILD_BENCHMARK)
    # Serial benchmarks: one executable per `scripts/bench_<name>.cpp`
    foreach (sz_bench IN ITEMS find sequence token container memory)
        define_launcher(
            stringzilla_bench_${sz_bench}_cpp20 scripts/bench_${sz_bench}.cpp 20 "${STRINGZILLA_TARGET_ARCH}"
        )
    endforeach ()

    # Parallel benchmarks: CPU variants first, then the CUDA variants of the same workloads
    foreach (sz_bench IN ITEMS similarities fingerprints)
        define_launcher(
            stringzillas_bench_${sz_bench}_cpp20 scripts/bench_${sz_bench}.cpp 20 "${STRINGZILLA_TARGET_ARCH}"
        )
    endforeach ()
    if (STRINGZILLA_BUILD_CUDA)
        foreach (sz_bench IN ITEMS similarities fingerprints)
            define_gpu_launcher(
                stringzillas_bench_${sz_bench}_cu20 scripts/bench_${sz_bench}.cu 20 "${STRINGZILLA_TARGET_ARCH}"
            )
        endforeach ()
    endif ()
endif ()

if (STRINGZILLA_BUILD_TEST)
    # Make sure that the compilation passes for different C++ standards!
    #
    # Keep in mind, MSVC only supports C++11 and newer.
    foreach (sz_test_std IN ITEMS 11 14 17 20)
        define_launcher(
            stringzilla_test_cpp${sz_test_std} scripts/test_stringzilla.cpp ${sz_test_std} "${STRINGZILLA_TARGET_ARCH}"
        )
    endforeach ()

    # Test parallel algorithms separately
    foreach (sz_test_std IN ITEMS 17 20)
        define_launcher(
            stringzillas_test_cpp${sz_test_std} scripts/test_stringzillas.cpp ${sz_test_std}
            "${STRINGZILLA_TARGET_ARCH}"
        )
    endforeach ()

    # To avoid bloating our codebase with `__device__` function annotations, we only target C++14 and newer to compile
    # `constexpr` functions on both host and device side. To avoid the complexity of defining too many template objects
    # and complex SFINAE, we only target C++17 and newer to compile `if constexpr` compile-time SIMD dispatch.
    if (STRINGZILLA_BUILD_CUDA)
        foreach (sz_test_std IN ITEMS 17 20)
            define_gpu_launcher(
                stringzillas_test_cu${sz_test_std} scripts/test_stringzillas.cu ${sz_test_std}
                "${STRINGZILLA_TARGET_ARCH}"
            )
        endforeach ()
    endif ()

    # Check system architecture to avoid complex cross-compilation workflows, but compile multiple backends. Each
    # entry is a "<test name suffix>=<architecture>" pair; MSVC takes `/arch:` names, other compilers `-march=` names.
    set(sz_test_backends "")
    if (SZ_PLATFORM_X86)
        # x86 specific backends
        if (MSVC)
            set(sz_test_backends "serial=AVX" "westmere=SSE4.2" "haswell=AVX2" "ice=AVX512")
        else ()
            set(sz_test_backends "serial=ivybridge" "westmere=westmere" "haswell=haswell" "ice=sapphirerapids")
        endif ()
    elseif (SZ_PLATFORM_ARM)
        # ARM specific backends
        set(sz_test_backends "serial=armv8-a" "neon=armv8-a+simd")
        # SVE is not supported on Apple Silicon, only compile on non-Darwin ARM platforms
        if (NOT CMAKE_SYSTEM_NAME MATCHES "Darwin")
            list(APPEND sz_test_backends "sve=armv8.2-a+sve")
        endif ()
    endif ()

    # Host-only tests for every backend come first, followed by the CUDA tests for the same backends
    foreach (sz_test_backend IN LISTS sz_test_backends)
        string(REGEX REPLACE "=.*$" "" sz_test_suffix "${sz_test_backend}")
        string(REGEX REPLACE "^[^=]+=" "" sz_test_arch "${sz_test_backend}")
        define_launcher(stringzilla_test_cpp20_${sz_test_suffix} scripts/test_stringzilla.cpp 20 "${sz_test_arch}")
    endforeach ()
    if (STRINGZILLA_BUILD_CUDA)
        foreach (sz_test_backend IN LISTS sz_test_backends)
            string(REGEX REPLACE "=.*$" "" sz_test_suffix "${sz_test_backend}")
            string(REGEX REPLACE "^[^=]+=" "" sz_test_arch "${sz_test_backend}")
            define_gpu_launcher(
                stringzillas_test_cu20_${sz_test_suffix} scripts/test_stringzillas.cu 20 "${sz_test_arch}"
            )
        endforeach ()
    endif ()

    # Do not leave the helper variables behind in the directory scope
    unset(sz_test_backends)
    unset(sz_test_suffix)
    unset(sz_test_arch)
endif ()

# Define our libraries, first the header-only version. It carries no sources, only the include path: the source
# tree while building, and the installed header directory afterwards. The install-time path uses the same variable
# as the `install(DIRECTORY ...)` rule for the headers, so the two cannot drift apart when the user overrides
# `CMAKE_INSTALL_INCLUDEDIR`.
add_library(stringzilla_header INTERFACE)
add_library(${PROJECT_NAME}::stringzilla_header ALIAS stringzilla_header)
target_include_directories(
    stringzilla_header INTERFACE $<BUILD_INTERFACE:${STRINGZILLA_INCLUDE_BUILD_DIR}>
                                 $<INSTALL_INTERFACE:${STRINGZILLA_INCLUDE_INSTALL_DIR}>
)

if (STRINGZILLA_BUILD_SHARED)

    # Defines a shared library named `target`, built from `c/stringzilla.c`, together with a
    # `${PROJECT_NAME}::<target>` alias. On recognized platforms the library is compiled for a baseline architecture
    # with every SIMD backend of that processor family enabled and all backends of the other family disabled.
    function (define_shared target)
        add_library(${target} SHARED c/stringzilla.c)
        add_library(${PROJECT_NAME}::${target} ALIAS ${target})

        # Position-independent code must be requested before `set_compiler_flags`, which reads the property
        set_target_properties(
            ${target}
            PROPERTIES POSITION_INDEPENDENT_CODE ON
                       VERSION ${PROJECT_VERSION}
                       SOVERSION 1
        )

        # Pick the baseline architecture and the 0/1 switches for the two backend families
        if (SZ_PLATFORM_X86)
            if (MSVC)
                set(baseline_arch "SSE2")
            else ()
                set(baseline_arch "ivybridge")
            endif ()
            set(use_x86_backends 1)
            set(use_arm_backends 0)
        elseif (SZ_PLATFORM_ARM)
            set(baseline_arch "armv8-a")
            set(use_x86_backends 0)
            set(use_arm_backends 1)
        endif ()

        # Unrecognized platforms get neither the default flags nor the backend definitions
        if (SZ_PLATFORM_X86 OR SZ_PLATFORM_ARM)
            set_compiler_flags(${target} "" "${baseline_arch}" "${CMAKE_CXX_COMPILER_ID}")
            target_compile_definitions(
                ${target}
                PRIVATE "SZ_USE_WESTMERE=${use_x86_backends}"
                        "SZ_USE_HASWELL=${use_x86_backends}"
                        "SZ_USE_SKYLAKE=${use_x86_backends}"
                        "SZ_USE_ICE=${use_x86_backends}"
                        "SZ_USE_NEON=${use_arm_backends}"
                        "SZ_USE_SVE=${use_arm_backends}"
                        "SZ_USE_SVE2=${use_arm_backends}"
            )
        endif ()

        if (MSVC)
            # Add dependencies for necessary runtime libraries in case of static linking. This ensures that basic
            # runtime functions are available:
            #
            # * msvcrt.lib: Microsoft Visual C Runtime, required for basic C runtime functions on Windows.
            # * vcruntime.lib: Microsoft Visual C++ Runtime library for basic runtime functions.
            # * ucrt.lib: Universal C Runtime, necessary for linking basic C functions like I/O.
            target_link_libraries(${target} PRIVATE msvcrt.lib vcruntime.lib ucrt.lib)
        endif ()

    endfunction ()

    # The regular shared library: uses LibC and is built with `SZ_OVERRIDE_LIBC=1`
    define_shared(stringzilla_shared)
    target_compile_definitions(stringzilla_shared PRIVATE "SZ_AVOID_LIBC=0" "SZ_OVERRIDE_LIBC=1")
    target_include_directories(stringzilla_shared PUBLIC include)

    # Try compiling a version without linking the LibC! It is skipped on Darwin, because on modern Arm-based macOS
    # machines we can't legally access Arm's "feature registers" without `sysctl` or `sysctlbyname`. It is also
    # skipped for MSVC, which has linker issues with bare builds.
    if (NOT (CMAKE_SYSTEM_NAME MATCHES "Darwin" OR CMAKE_CXX_COMPILER_ID MATCHES "MSVC"))
        define_shared(stringzilla_bare)
        target_compile_definitions(stringzilla_bare PRIVATE "SZ_AVOID_LIBC=1" "SZ_OVERRIDE_LIBC=1")
        target_include_directories(stringzilla_bare PUBLIC include)

        # Avoid built-ins on GCC and Clang compilers, and keep the standard library out of the link
        target_compile_options(stringzilla_bare PRIVATE "-fno-builtin" "-nostdlib")
        target_link_options(stringzilla_bare PRIVATE "-nostdlib")
    endif ()
endif ()

if (STRINGZILLAS_BUILD_SHARED)
    # StringZillas shared library targets for parallel string operations
    # Defines a StringZillas shared library together with a `${PROJECT_NAME}::<target>` alias.
    #
    # * target: Name of the library target to create.
    # * source_file: The single translation unit to compile.
    # * backend_flags: Semicolon-separated list of compile definitions selecting the backend, for example
    #   "SZ_USE_CUDA=0;SZ_USE_ROCM=0".
    function (define_stringzillas_shared target source_file backend_flags)
        add_library(${target} SHARED ${source_file})
        add_library(${PROJECT_NAME}::${target} ALIAS ${target})

        set_target_properties(
            ${target}
            PROPERTIES VERSION ${PROJECT_VERSION}
                       SOVERSION 1
                       POSITION_INDEPENDENT_CODE ON
        )

        target_include_directories(${target} PUBLIC include)
        target_include_directories(${target} PRIVATE fork_union/include)
        target_compile_definitions(${target} PRIVATE "SZ_DYNAMIC_DISPATCH=1")
        target_compile_definitions(${target} PRIVATE "SZ_AVOID_LIBC=0")
        target_compile_definitions(${target} PRIVATE "SZ_DEBUG=0")

        # Set backend-specific compilation flags; the list expands into one definition per element
        target_compile_definitions(${target} PRIVATE ${backend_flags})

        # Use C++20 for StringZillas
        set_target_properties(${target} PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON)

        # Optimization level. The compiler IDs are combined with `$<OR:...>` because the comma-separated form of
        # `$<CXX_COMPILER_ID:...>` needs CMake 3.15, while this project declares 3.14 as its minimum. AppleClang is
        # included, as everywhere else in this file where Clang flags are selected.
        target_compile_options(
            ${target}
            PRIVATE
                "$<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:-O3;-fPIC>"
                "$<$<CXX_COMPILER_ID:MSVC>:/O2>"
        )

        # Tell the sources which 64-bit processor family they are built for. This reuses the project-wide platform
        # detection, so the spellings "amd64", "ARM64" and "AARCH64" are classified the same way here as elsewhere.
        if (SZ_PLATFORM_X86)
            target_compile_definitions(${target} PRIVATE "SZ_IS_64BIT_X86_=1" "SZ_IS_64BIT_ARM_=0")
        elseif (SZ_PLATFORM_ARM)
            target_compile_definitions(${target} PRIVATE "SZ_IS_64BIT_X86_=0" "SZ_IS_64BIT_ARM_=1")
        endif ()

        # Link threading libraries for CPU backend
        find_package(Threads REQUIRED)
        target_link_libraries(${target} PRIVATE Threads::Threads)

        # Platform-specific runtime libraries (similar to define_shared)
        if (WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            target_link_libraries(${target} PRIVATE msvcrt.lib vcruntime.lib ucrt.lib)
        endif ()

    endfunction ()

    # The multi-threaded CPU library, with both GPU backends switched off
    define_stringzillas_shared(stringzillas_cpus_shared c/stringzillas.cpp "SZ_USE_CUDA=0;SZ_USE_ROCM=0")

    # The CUDA library, defined only when CUDA targets are being built
    if (STRINGZILLA_BUILD_CUDA)
        define_stringzillas_shared(stringzillas_cuda_shared c/stringzillas.cu "SZ_USE_CUDA=1;SZ_USE_ROCM=0")

        # Link against the CUDA runtime
        target_link_libraries(stringzillas_cuda_shared PRIVATE CUDA::cudart)

        # CUDA language standard, target architecture (we dispatch manually), and separable compilation of the
        # device code
        set_target_properties(
            stringzillas_cuda_shared
            PROPERTIES CUDA_STANDARD 20
                       CUDA_STANDARD_REQUIRED ON
                       CUDA_ARCHITECTURES "90a"
                       CUDA_SEPARABLE_COMPILATION ON
        )

        # Same flag that `set_compiler_flags` passes for NVIDIA compilers
        target_compile_options(stringzillas_cuda_shared PRIVATE "--expt-relaxed-constexpr")

        # Make sure the translation unit is compiled as CUDA
        set_source_files_properties(
            c/stringzillas.cu TARGET_DIRECTORY stringzillas_cuda_shared PROPERTIES LANGUAGE CUDA
        )
    endif ()

    # TODO: Define the `stringzillas_rocm_shared` library with "SZ_USE_CUDA=0;SZ_USE_ROCM=1" once ROCm support is
    # added, guarded by its own build option.

endif ()

# Installation rules, active only when `STRINGZILLA_INSTALL` is ON.
if (STRINGZILLA_INSTALL)
    # Install every shared library that was actually defined. Each rule is guarded by the existence of the very
    # target it installs, so that disabling a library (for example `STRINGZILLA_BUILD_SHARED=OFF`) skips its rule
    # instead of failing the configuration with a "target does not exist" error.
    foreach (
        sz_install_target IN
        ITEMS stringzilla_shared stringzilla_bare stringzillas_cpus_shared stringzillas_cuda_shared
    )
        if (TARGET ${sz_install_target})
            # All artifact kinds are listed without options, so each one uses CMake's default destination
            install(
                TARGETS ${sz_install_target}
                ARCHIVE
                BUNDLE
                FRAMEWORK
                LIBRARY
                OBJECTS
                PRIVATE_HEADER
                PUBLIC_HEADER
                RESOURCE
                RUNTIME
            )
        endif ()
    endforeach ()

    # Public headers: the trailing slash of the source directory copies its contents, not the directory itself
    install(DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR} DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR})

    # C sources for downstream packaging.
    # NOTE(review): the destination is an absolute path, so it is not relocated by `CMAKE_INSTALL_PREFIX`
    # - confirm this is intended.
    install(DIRECTORY ./c/ DESTINATION /usr/src/${PROJECT_NAME}/)
endif ()
