//! This file implements the two TLS variants [1] used by ELF-based systems. Note that, in reality,
//! Variant I has two sub-variants.
//!
//! It is important to understand that the term TCB (Thread Control Block) is overloaded here.
//! Official ABI documentation uses it simply to mean the ABI TCB, i.e. a small area of ABI-defined
//! data, usually one or two words (see the `AbiTcb` type below). People will also often use TCB to
//! refer to the libc TCB, which can be any size and contain anything. (One could even omit it!) We
//! refer to the latter as the Zig TCB; see the `ZigTcb` type below.
//!
//! [1] https://www.akkadia.org/drepper/tls.pdf

const std = @import("std");
const mem = std.mem;
const elf = std.elf;
const math = std.math;
const assert = std.debug.assert;
const native_arch = @import("builtin").cpu.arch;
const linux = std.os.linux;
const posix = std.posix;
const page_size_min = std.heap.page_size_min;

/// Represents an ELF TLS variant.
///
/// In all variants, the TP and the TLS blocks must be aligned to the `p_align` value in the
/// `PT_TLS` ELF program header. Everything else has natural alignment.
///
/// The location of the DTV does not actually matter. For simplicity, we put it in the TLS area, but
/// there is no actual ABI requirement that it reside there.
const Variant = enum {
    /// The original Variant I:
    ///
    /// ----------------------------------------
    /// | DTV | Zig TCB | ABI TCB | TLS Blocks |
    /// ----------------^-----------------------
    ///                 `-- The TP register points here.
    ///
    /// The layout in this variant necessitates separate alignment of both the TP and the TLS
    /// blocks.
    ///
    /// The first word in the ABI TCB points to the DTV. For some architectures, there may be a
    /// second word with an unspecified meaning.
    I_original,
    /// The modified Variant I:
    ///
    /// ---------------------------------------------------
    /// | DTV | Zig TCB | ABI TCB | [Offset] | TLS Blocks |
    /// -------------------------------------^-------------
    ///                                      `-- The TP register points here.
    ///
    /// The offset (which can be zero) is applied to the TP only; there is never a physical gap
    /// between the ABI TCB and the TLS blocks. This implies that we only need to align the TP.
    ///
    /// The first (and only) word in the ABI TCB points to the DTV.
    I_modified,
    /// Variant II:
    ///
    /// ----------------------------------------
    /// | TLS Blocks | ABI TCB | Zig TCB | DTV |
    /// -------------^--------------------------
    ///              `-- The TP register points here.
    ///
    /// The first (and only) word in the ABI TCB points to the ABI TCB itself.
    II,
};

const current_variant: Variant = switch (native_arch) {
    .arc,
    .arm,
    .armeb,
    .aarch64,
    .aarch64_be,
    .csky,
    .thumb,
    .thumbeb,
    => .I_original,
    .loongarch32,
    .loongarch64,
    .m68k,
    .mips,
    .mipsel,
    .mips64,
    .mips64el,
    .powerpc,
    .powerpcle,
    .powerpc64,
    .powerpc64le,
    .riscv32,
    .riscv64,
    => .I_modified,
    .hexagon,
    .s390x,
    .sparc,
    .sparc64,
    .x86,
    .x86_64,
    => .II,
    else => @compileError("undefined TLS variant for this architecture"),
};

/// The Offset value for the modified Variant I.
const current_tp_offset = switch (native_arch) {
    .m68k,
    .mips,
    .mipsel,
    .mips64,
    .mips64el,
    .powerpc,
    .powerpcle,
    .powerpc64,
    .powerpc64le,
    => 0x7000,
    else => 0,
};

/// Usually only used by the modified Variant I.
const current_dtv_offset = switch (native_arch) {
    .m68k,
    .mips,
    .mipsel,
    .mips64,
    .mips64el,
    .powerpc,
    .powerpcle,
    .powerpc64,
    .powerpc64le,
    => 0x8000,
    .riscv32,
    .riscv64,
    => 0x800,
    else => 0,
};
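
// Illustrative note (an addition to the original comments, not ABI text): the variants matter
// because compilers bake TP-relative offsets into generated code at compile/link time. On a
// Variant II target such as x86_64, a local-exec TLS load is emitted as something like
// `mov %fs:<negative offset>, %rax`, i.e. a negative offset from the TP, while on the modified
// Variant I (e.g. riscv64) the same variable sits at a small positive offset from `tp`. The
// layout code below must therefore reproduce exactly the layout those precomputed offsets assume.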
/// Per-thread storage for the ELF TLS ABI.
const AbiTcb = switch (current_variant) {
    .I_original, .I_modified => switch (native_arch) {
        // ARM EABI mandates enough space for two pointers: the first one points to the DTV as
        // usual, while the second one is unspecified.
        .aarch64,
        .aarch64_be,
        .arm,
        .armeb,
        .thumb,
        .thumbeb,
        => extern struct {
            /// This is offset by `current_dtv_offset`.
            dtv: usize,
            reserved: ?*anyopaque,
        },
        else => extern struct {
            /// This is offset by `current_dtv_offset`.
            dtv: usize,
        },
    },
    .II => extern struct {
        /// This is self-referential.
        self: *AbiTcb,
    },
};

/// Per-thread storage for Zig's use. Currently unused.
const ZigTcb = struct {
    dummy: usize,
};

/// Dynamic Thread Vector as specified in the ELF TLS ABI. Ordinarily, there is a block pointer per
/// dynamically-loaded module, but since we only support static TLS, we only need one block pointer.
const Dtv = extern struct {
    len: usize = 1,
    tls_block: [*]u8,
};

/// Describes a process's TLS area. The area encompasses the DTV, both TCBs, and the TLS block, with
/// the exact layout of these being dependent primarily on `current_variant`.
const AreaDesc = struct {
    size: usize,
    alignment: usize,
    dtv: struct {
        /// Offset into the TLS area.
        offset: usize,
    },
    abi_tcb: struct {
        /// Offset into the TLS area.
        offset: usize,
    },
    block: struct {
        /// The initial data to be copied into the TLS block. Note that this may be smaller than
        /// `size`, in which case any remaining data in the TLS block is simply left uninitialized.
        init: []const u8,
        /// Offset into the TLS area.
        offset: usize,
        /// This is the effective size of the TLS block, which may be greater than `init.len`.
        size: usize,
    },
    /// Only used on the 32-bit x86 architecture (not x86_64, nor x32).
    gdt_entry_number: usize,
};

pub var area_desc: AreaDesc = undefined;
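
// For reference only: with dynamic TLS, a `__tls_get_addr`-style helper would index the DTV to
// find a given module's block. A hypothetical sketch (the helper and its names are illustrative;
// nothing like it exists in this file, since only static TLS is supported and the compiler
// addresses the single block directly relative to the TP):
//
//     fn tlsGetAddr(dtv: *Dtv, module: usize, offset: usize) *anyopaque {
//         // With only static TLS, `module` is always 1 and `tls_block` is the only entry.
//         const blocks: [*][*]u8 = @ptrCast(&dtv.tls_block);
//         return @ptrCast(blocks[module - 1] + offset);
//     }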
pub fn setThreadPointer(addr: usize) void {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    switch (native_arch) {
        .x86 => {
            var user_desc: linux.user_desc = .{
                .entry_number = area_desc.gdt_entry_number,
                .base_addr = addr,
                .limit = 0xfffff,
                .flags = .{
                    .seg_32bit = 1,
                    .contents = 0, // Data
                    .read_exec_only = 0,
                    .limit_in_pages = 1,
                    .seg_not_present = 0,
                    .useable = 1,
                },
            };
            const rc = @call(.always_inline, linux.syscall1, .{ .set_thread_area, @intFromPtr(&user_desc) });
            assert(rc == 0);

            const gdt_entry_number = user_desc.entry_number;
            // We have to keep track of our slot as it's also needed for clone().
            area_desc.gdt_entry_number = gdt_entry_number;
            // Update the %gs selector.
            asm volatile ("movl %[gs_val], %%gs"
                :
                : [gs_val] "r" (gdt_entry_number << 3 | 3),
            );
        },
        .x86_64 => {
            const rc = @call(.always_inline, linux.syscall2, .{ .arch_prctl, linux.ARCH.SET_FS, addr });
            assert(rc == 0);
        },
        .aarch64, .aarch64_be => {
            asm volatile (
                \\ msr tpidr_el0, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .arc => {
            // We apparently need to both set r25 (TP) *and* inform the kernel...
            asm volatile (
                \\ mov r25, %[addr]
                :
                : [addr] "r" (addr),
            );
            const rc = @call(.always_inline, linux.syscall1, .{ .arc_settls, addr });
            assert(rc == 0);
        },
        .arm, .armeb, .thumb, .thumbeb => {
            const rc = @call(.always_inline, linux.syscall1, .{ .set_tls, addr });
            assert(rc == 0);
        },
        .m68k => {
            const rc = linux.syscall1(.set_thread_area, addr);
            assert(rc == 0);
        },
        .hexagon => {
            asm volatile (
                \\ ugp = %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .loongarch32, .loongarch64 => {
            asm volatile (
                \\ move $tp, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .riscv32, .riscv64 => {
            asm volatile (
                \\ mv tp, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .csky, .mips, .mipsel, .mips64, .mips64el => {
            const rc = @call(.always_inline, linux.syscall1, .{ .set_thread_area, addr });
            assert(rc == 0);
        },
        .powerpc, .powerpcle => {
            asm volatile (
                \\ mr 2, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .powerpc64, .powerpc64le => {
            asm volatile (
                \\ mr 13, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .s390x => {
            asm volatile (
                \\ lgr %%r0, %[addr]
                \\ sar %%a1, %%r0
                \\ srlg %%r0, %%r0, 32
                \\ sar %%a0, %%r0
                :
                : [addr] "r" (addr),
                : "r0"
            );
        },
        .sparc, .sparc64 => {
            asm volatile (
                \\ mov %[addr], %%g7
                :
                : [addr] "r" (addr),
            );
        },
        else => @compileError("Unsupported architecture"),
    }
}
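
// Worked example for the layout computation below (illustrative; the numbers assume x86_64, i.e.
// Variant II, with a hypothetical `PT_TLS` of p_memsz = 100 and p_align = 64):
//
//     block_offset   = 0
//     abi_tcb_offset = alignForward(100, 64)    = 128  <-- the TP points here
//     ZigTcb offset  = 128 + @sizeOf(AbiTcb)    = 136
//     dtv_offset     = alignForward(136 + 8, 8) = 144
//     area size      = 144 + @sizeOf(Dtv)       = 160
//
// TLS loads then use negative offsets from the TP, landing inside [0, 100).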
fn computeAreaDesc(phdrs: []elf.Phdr) void {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    var tls_phdr: ?*elf.Phdr = null;
    var img_base: usize = 0;

    for (phdrs) |*phdr| {
        switch (phdr.p_type) {
            elf.PT_PHDR => img_base = @intFromPtr(phdrs.ptr) - phdr.p_vaddr,
            elf.PT_TLS => tls_phdr = phdr,
            else => {},
        }
    }

    var align_factor: usize = undefined;
    var block_init: []const u8 = undefined;
    var block_size: usize = undefined;

    if (tls_phdr) |phdr| {
        align_factor = phdr.p_align;

        // The effective size in memory is represented by `p_memsz`; the length of the data stored
        // in the `PT_TLS` segment is `p_filesz` and may be less than the former.
        block_init = @as([*]u8, @ptrFromInt(img_base + phdr.p_vaddr))[0..phdr.p_filesz];
        block_size = phdr.p_memsz;
    } else {
        align_factor = @alignOf(usize);

        block_init = &[_]u8{};
        block_size = 0;
    }

    // Offsets into the allocated TLS area.
    var dtv_offset: usize = undefined;
    var abi_tcb_offset: usize = undefined;
    var block_offset: usize = undefined;

    // Compute the total size of the ABI-specific data plus our own `ZigTcb` structure. All the
    // offsets calculated here assume a well-aligned base address.
    const area_size = switch (current_variant) {
        .I_original => blk: {
            var l: usize = 0;

            dtv_offset = l;
            l += @sizeOf(Dtv);

            // Add some padding here so that the TP (`abi_tcb_offset`) is aligned to `align_factor`
            // and the `ZigTcb` structure can be found by simply subtracting `@sizeOf(ZigTcb)` from
            // the TP.
            const delta = (l + @sizeOf(ZigTcb)) & (align_factor - 1);
            if (delta > 0) l += align_factor - delta;
            l += @sizeOf(ZigTcb);

            abi_tcb_offset = l;
            l += alignForward(@sizeOf(AbiTcb), align_factor);

            block_offset = l;
            l += block_size;

            break :blk l;
        },
        .I_modified => blk: {
            var l: usize = 0;

            dtv_offset = l;
            l += @sizeOf(Dtv);

            // In this variant, the TLS blocks must begin immediately after the end of the ABI TCB,
            // with the TP pointing to the beginning of the TLS blocks. Add padding so that the TP
            // (`abi_tcb_offset`) is aligned to `align_factor` and the `ZigTcb` structure can be
            // found by subtracting `@sizeOf(AbiTcb) + @sizeOf(ZigTcb)` from the TP.
            const delta = (l + @sizeOf(ZigTcb) + @sizeOf(AbiTcb)) & (align_factor - 1);
            if (delta > 0) l += align_factor - delta;
            l += @sizeOf(ZigTcb);

            abi_tcb_offset = l;
            l += @sizeOf(AbiTcb);

            block_offset = l;
            l += block_size;

            break :blk l;
        },
        .II => blk: {
            var l: usize = 0;

            block_offset = l;
            l += alignForward(block_size, align_factor);

            // The TP is aligned to `align_factor`.
            abi_tcb_offset = l;
            l += @sizeOf(AbiTcb);

            // The `ZigTcb` structure is right after the `AbiTcb` with no padding in between so it
            // can be easily found.
            l += @sizeOf(ZigTcb);

            // It doesn't really matter where we put the DTV, so give it natural alignment.
            l = alignForward(l, @alignOf(Dtv));
            dtv_offset = l;
            l += @sizeOf(Dtv);

            break :blk l;
        },
    };

    area_desc = .{
        .size = area_size,
        .alignment = align_factor,
        .dtv = .{
            .offset = dtv_offset,
        },
        .abi_tcb = .{
            .offset = abi_tcb_offset,
        },
        .block = .{
            .init = block_init,
            .offset = block_offset,
            .size = block_size,
        },
        .gdt_entry_number = @as(usize, @bitCast(@as(isize, -1))),
    };
}

/// Inline because TLS is not set up yet.
inline fn alignForward(addr: usize, alignment: usize) usize {
    return alignBackward(addr + (alignment - 1), alignment);
}

/// Inline because TLS is not set up yet.
inline fn alignBackward(addr: usize, alignment: usize) usize {
    return addr & ~(alignment - 1);
}

/// Inline because TLS is not set up yet.
inline fn alignPtrCast(comptime T: type, ptr: [*]u8) *T {
    return @ptrCast(@alignCast(ptr));
}

/// Initializes all the fields of the static TLS area and returns the computed architecture-specific
/// value of the TP register.
pub fn prepareArea(area: []u8) usize {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    // Clear the area we're going to use, just to be safe.
    @memset(area, 0);

    // Prepare the ABI TCB.
    const abi_tcb = alignPtrCast(AbiTcb, area.ptr + area_desc.abi_tcb.offset);
    switch (current_variant) {
        .I_original, .I_modified => abi_tcb.dtv = @intFromPtr(area.ptr + area_desc.dtv.offset),
        .II => abi_tcb.self = abi_tcb,
    }

    // Prepare the DTV.
    const dtv = alignPtrCast(Dtv, area.ptr + area_desc.dtv.offset);
    dtv.len = 1;
    dtv.tls_block = area.ptr + current_dtv_offset + area_desc.block.offset;

    // Copy the initial data.
    @memcpy(area[area_desc.block.offset..][0..area_desc.block.init.len], area_desc.block.init);

    // Return the corrected value (if needed) for the TP register. Overflow here is not a problem;
    // the pointer arithmetic involving the TP is done with wrapping semantics.
    return @intFromPtr(area.ptr) +% switch (current_variant) {
        .I_original, .II => area_desc.abi_tcb.offset,
        .I_modified => area_desc.block.offset +% current_tp_offset,
    };
}

/// The main motivation for the size chosen here is that this is how much ends up being requested for
/// the thread-local variables of the `std.crypto.random` implementation. I'm not sure why it ends up
/// being so much; the struct itself is only 64 bytes. I think it has to do with being page-aligned
/// and LLVM or LLD is not smart enough to lay out the TLS data in a space-conserving way. Anyway, I
/// think it's fine because it's less than 3 pages of memory, and putting it in the ELF like this is
/// equivalent to moving the `mmap` call below into the kernel, avoiding syscall overhead.
var main_thread_area_buffer: [0x2100]u8 align(page_size_min) = undefined;
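
// Note on the TP bias (an illustrative addition): on MIPS, for example, `prepareArea` above
// returns `area.ptr + block_offset + 0x7000`, and code generated for TLS accesses subtracts the
// same bias again, so the effective address still lands inside the block. Such biases exist so
// that architectures with signed 16-bit displacements can reach the whole TLS block with a single
// immediate-offset instruction. The wrapping `+%` is deliberate: the biased value may overflow
// `usize` near the top of the address space, and consumers wrap the same way in reverse.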
/// Computes the layout of the static TLS area, allocates the area, initializes all of its fields,
/// and assigns the architecture-specific value to the TP register.
pub fn initStatic(phdrs: []elf.Phdr) void {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    computeAreaDesc(phdrs);

    const area = blk: {
        // Fast path for the common case where the TLS data is really small: avoid an allocation
        // and use our local buffer.
        if (area_desc.alignment <= page_size_min and area_desc.size <= main_thread_area_buffer.len) {
            break :blk main_thread_area_buffer[0..area_desc.size];
        }

        const begin_addr = mmap_tls(area_desc.size + area_desc.alignment - 1);
        if (@call(.always_inline, linux.E.init, .{begin_addr}) != .SUCCESS) @trap();

        const area_ptr: [*]align(page_size_min) u8 = @ptrFromInt(begin_addr);

        // Make sure the slice is correctly aligned.
        const begin_aligned_addr = alignForward(begin_addr, area_desc.alignment);
        const start = begin_aligned_addr - begin_addr;
        break :blk area_ptr[start..][0..area_desc.size];
    };

    const tp_value = prepareArea(area);
    setThreadPointer(tp_value);
}

inline fn mmap_tls(length: usize) usize {
    const prot = posix.PROT.READ | posix.PROT.WRITE;
    const flags: linux.MAP = .{ .TYPE = .PRIVATE, .ANONYMOUS = true };

    if (@hasField(linux.SYS, "mmap2")) {
        return @call(.always_inline, linux.syscall6, .{
            .mmap2,
            0,
            length,
            prot,
            @as(u32, @bitCast(flags)),
            @as(usize, @bitCast(@as(isize, -1))),
            0,
        });
    } else {
        // The s390x mmap() syscall existed before Linux supported syscalls with 5+ parameters, so
        // it takes a single pointer to an array of arguments instead.
        return if (native_arch == .s390x) @call(.always_inline, linux.syscall1, .{
            .mmap,
            @intFromPtr(&[_]usize{
                0,
                length,
                prot,
                @as(u32, @bitCast(flags)),
                @as(usize, @bitCast(@as(isize, -1))),
                0,
            }),
        }) else @call(.always_inline, linux.syscall6, .{
            .mmap,
            0,
            length,
            prot,
            @as(u32, @bitCast(flags)),
            @as(usize, @bitCast(@as(isize, -1))),
            0,
        });
    }
}
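
// A small sanity check for the alignment helpers above. This test is an illustrative addition and
// assumes (as the helpers themselves do) that `alignment` is a power of two.
test "alignForward and alignBackward round to power-of-two boundaries" {
    try std.testing.expectEqual(@as(usize, 0), alignForward(0, 16));
    try std.testing.expectEqual(@as(usize, 16), alignForward(1, 16));
    try std.testing.expectEqual(@as(usize, 16), alignForward(16, 16));
    try std.testing.expectEqual(@as(usize, 0), alignBackward(15, 16));
    try std.testing.expectEqual(@as(usize, 16), alignBackward(17, 16));
}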