std: Minor changes to TLS handling

* Always allocate an info block per-thread so that libc can store
  important stuff there.
* Respect ABI-mandated alignment in more places.
* Nicer code, use slices/pointers instead of raw addresses whenever
  possible.
LemonBoy 2020-03-25 12:08:50 +01:00 committed by Andrew Kelley
parent 12e1c6e21c
commit d788b0cd8b
4 changed files with 153 additions and 141 deletions

lib/std/os/linux/tls.zig

@@ -1,8 +1,9 @@
 const std = @import("std");
+const builtin = std.builtin;
 const os = std.os;
 const mem = std.mem;
 const elf = std.elf;
-const builtin = @import("builtin");
+const math = std.math;
 const assert = std.debug.assert;
 
 // This file implements the two TLS variants [1] used by ELF-based systems.
@@ -57,28 +58,16 @@ const tls_tcb_size = switch (builtin.arch) {
     // ARM EABI mandates enough space for two pointers: the first one points to
     // the DTV while the second one is unspecified but reserved
     .arm, .armeb, .aarch64, .aarch64_be => 2 * @sizeOf(usize),
+    // One pointer-sized word that points either to the DTV or the TCB itself
     else => @sizeOf(usize),
 };
 
-// Controls if the TCB should be aligned according to the TLS segment p_align
-const tls_tcb_align_size = switch (builtin.arch) {
-    .arm, .armeb, .aarch64, .aarch64_be => true,
-    else => false,
-};
-
 // Controls if the TP points to the end of the TCB instead of its beginning
 const tls_tp_points_past_tcb = switch (builtin.arch) {
     .riscv32, .riscv64, .mipsel, .powerpc64, .powerpc64le => true,
     else => false,
 };
 
-// Check if the architecture-specific parameters look correct
-comptime {
-    if (tls_tcb_align_size and tls_variant != TLSVariant.VariantI) {
-        @compileError("tls_tcb_align_size is only meaningful for variant I TLS");
-    }
-}
-
 // Some architectures add some offset to the tp and dtv addresses in order to
 // make the generated code more efficient
@@ -94,32 +83,36 @@ const tls_dtv_offset = switch (builtin.arch) {
 };
 
 // Per-thread storage for Zig's use
-const CustomData = packed struct {};
+const CustomData = struct {
+    dummy: usize,
+};
 
 // Dynamic Thread Vector
-const DTV = packed struct {
+const DTV = extern struct {
     entries: usize,
-    tls_block: [1]usize,
+    tls_block: [1][*]u8,
 };
 
 // Holds all the information about the process TLS image
 const TLSImage = struct {
-    data_src: []u8,
+    init_data: []const u8,
     alloc_size: usize,
+    alloc_align: usize,
     tcb_offset: usize,
     dtv_offset: usize,
     data_offset: usize,
+    data_size: usize,
     // Only used on the i386 architecture
     gdt_entry_number: usize,
 };
 
-pub var tls_image: ?TLSImage = null;
+pub var tls_image: TLSImage = undefined;
 
 pub fn setThreadPointer(addr: usize) void {
     switch (builtin.arch) {
         .i386 => {
             var user_desc = std.os.linux.user_desc{
-                .entry_number = tls_image.?.gdt_entry_number,
+                .entry_number = tls_image.gdt_entry_number,
                 .base_addr = addr,
                 .limit = 0xfffff,
                 .seg_32bit = 1,
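A note on the struct changes in this hunk: Zig's packed structs are bit-packed and make no promises about addressable field layout, while extern structs follow the C ABI layout that code walking the DTV relies on. A minimal, self-contained check of that assumption, mirroring the new DTV shape (an illustration, not part of the commit):

    const std = @import("std");

    test "extern struct DTV has C-compatible field offsets" {
        const DTV = extern struct {
            entries: usize,
            tls_block: [1][*]u8,
        };
        // The entry count sits at offset 0 and the first per-module block
        // pointer directly after it, which is what DTV indexing expects.
        std.debug.assert(@byteOffsetOf(DTV, "entries") == 0);
        std.debug.assert(@byteOffsetOf(DTV, "tls_block") == @sizeOf(usize));
    }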
@@ -134,7 +127,7 @@ pub fn setThreadPointer(addr: usize) void {
             const gdt_entry_number = user_desc.entry_number;
             // We have to keep track of our slot as it's also needed for clone()
-            tls_image.?.gdt_entry_number = gdt_entry_number;
+            tls_image.gdt_entry_number = gdt_entry_number;
             // Update the %gs selector
             asm volatile ("movl %[gs_val], %%gs"
                 :
@@ -171,7 +164,7 @@ pub fn setThreadPointer(addr: usize) void {
     }
 }
 
-pub fn initTLS() ?*elf.Phdr {
+fn initTLS() void {
     var tls_phdr: ?*elf.Phdr = null;
     var img_base: usize = 0;
@@ -195,124 +188,149 @@ pub fn initTLS() ?*elf.Phdr {
     // Sanity check
     assert(at_phent == @sizeOf(elf.Phdr));
 
-    // Search the TLS section
+    // Find the TLS section
     const phdrs = (@intToPtr([*]elf.Phdr, at_phdr))[0..at_phnum];
 
-    var gnu_stack: ?*elf.Phdr = null;
-
     for (phdrs) |*phdr| {
         switch (phdr.p_type) {
             elf.PT_PHDR => img_base = at_phdr - phdr.p_vaddr,
             elf.PT_TLS => tls_phdr = phdr,
-            elf.PT_GNU_STACK => gnu_stack = phdr,
-            else => continue,
+            else => {},
         }
     }
 
+    // If the cpu is ARM-based, check if it supports the TLS register
+    if (comptime builtin.arch.isARM() and at_hwcap & std.os.linux.HWCAP_TLS == 0) {
+        // If the CPU does not support TLS via a coprocessor register,
+        // a kernel helper function can be used instead on certain linux kernels.
+        // See linux/arch/arm/include/asm/tls.h and musl/src/thread/arm/__set_thread_area.c.
+        @panic("TODO: Implement ARM fallback TLS functionality");
+    }
+
+    var tls_align_factor: usize = undefined;
+    var tls_data: []const u8 = undefined;
+    var tls_data_alloc_size: usize = undefined;
     if (tls_phdr) |phdr| {
-        // If the cpu is arm-based, check if it supports the TLS register
-        if (builtin.arch == .arm and at_hwcap & std.os.linux.HWCAP_TLS == 0) {
-            // If the CPU does not support TLS via a coprocessor register,
-            // a kernel helper function can be used instead on certain linux kernels.
-            // See linux/arch/arm/include/asm/tls.h and musl/src/thread/arm/__set_thread_area.c.
-            @panic("TODO: Implement ARM fallback TLS functionality");
-        }
-
-        // Offsets into the allocated TLS area
-        var tcb_offset: usize = undefined;
-        var dtv_offset: usize = undefined;
-        var data_offset: usize = undefined;
-        var thread_data_offset: usize = undefined;
-        // Compute the total size of the ABI-specific data plus our own control
-        // structures
-        const alloc_size = switch (tls_variant) {
-            .VariantI => blk: {
-                var l: usize = 0;
-                dtv_offset = l;
-                l += @sizeOf(DTV);
-                thread_data_offset = l;
-                l += @sizeOf(CustomData);
-                l = mem.alignForward(l, phdr.p_align);
-                tcb_offset = l;
-                if (tls_tcb_align_size) {
-                    l += mem.alignForward(tls_tcb_size, phdr.p_align);
-                } else {
-                    l += tls_tcb_size;
-                }
-                data_offset = l;
-                l += phdr.p_memsz;
-                break :blk l;
-            },
-            .VariantII => blk: {
-                var l: usize = 0;
-                data_offset = l;
-                l += phdr.p_memsz;
-                l = mem.alignForward(l, phdr.p_align);
-                tcb_offset = l;
-                l += tls_tcb_size;
-                thread_data_offset = l;
-                l += @sizeOf(CustomData);
-                dtv_offset = l;
-                l += @sizeOf(DTV);
-                break :blk l;
-            },
-        };
-
-        tls_image = TLSImage{
-            .data_src = @intToPtr([*]u8, phdr.p_vaddr + img_base)[0..phdr.p_filesz],
-            .alloc_size = alloc_size,
-            .tcb_offset = tcb_offset,
-            .dtv_offset = dtv_offset,
-            .data_offset = data_offset,
-            .gdt_entry_number = @bitCast(usize, @as(isize, -1)),
-        };
+        // The effective size in memory is represented by p_memsz, the length of
+        // the data stored in the PT_TLS segment is p_filesz and may be less
+        // than the former
+        tls_align_factor = phdr.p_align;
+        tls_data = @intToPtr([*]u8, img_base + phdr.p_vaddr)[0..phdr.p_filesz];
+        tls_data_alloc_size = phdr.p_memsz;
+    } else {
+        tls_align_factor = @alignOf(*usize);
+        tls_data = &[_]u8{};
+        tls_data_alloc_size = 0;
     }
 
-    return gnu_stack;
+    // Offsets into the allocated TLS area
+    var tcb_offset: usize = undefined;
+    var dtv_offset: usize = undefined;
+    var data_offset: usize = undefined;
+    // Compute the total size of the ABI-specific data plus our own control
+    // structures. All the offsets calculated here assume a well-aligned base
+    // address.
+    const alloc_size = switch (tls_variant) {
+        .VariantI => blk: {
+            var l: usize = 0;
+            dtv_offset = l;
+            l += @sizeOf(DTV);
+            // Add some padding here so that the thread pointer (tcb_offset) is
+            // aligned to p_align and the CustomData structure can be found by
+            // simply subtracting its @sizeOf from the tp value
+            const delta = (l + @sizeOf(CustomData)) & (tls_align_factor - 1);
+            if (delta > 0)
+                l += tls_align_factor - delta;
+            l += @sizeOf(CustomData);
+            tcb_offset = l;
+            l += mem.alignForward(tls_tcb_size, tls_align_factor);
+            data_offset = l;
+            l += tls_data_alloc_size;
+            break :blk l;
+        },
+        .VariantII => blk: {
+            var l: usize = 0;
+            data_offset = l;
+            l += mem.alignForward(tls_data_alloc_size, tls_align_factor);
+            // The thread pointer is aligned to p_align
+            tcb_offset = l;
+            l += tls_tcb_size;
+            // The CustomData structure is right after the TCB with no padding
+            // in between so it can be easily found
+            l += @sizeOf(CustomData);
+            l = mem.alignForward(l, @alignOf(DTV));
+            dtv_offset = l;
+            l += @sizeOf(DTV);
+            break :blk l;
+        },
+    };
+
+    tls_image = TLSImage{
+        .init_data = tls_data,
+        .alloc_size = alloc_size,
+        .alloc_align = tls_align_factor,
+        .tcb_offset = tcb_offset,
+        .dtv_offset = dtv_offset,
+        .data_offset = data_offset,
+        .data_size = tls_data_alloc_size,
+        .gdt_entry_number = @bitCast(usize, @as(isize, -1)),
+    };
 }
 
-pub fn copyTLS(addr: usize) usize {
-    const tls_img = tls_image.?;
-
-    // Be paranoid, clear the area we're going to use
-    @memset(@intToPtr([*]u8, addr), 0, tls_img.alloc_size);
+inline fn alignPtrCast(comptime T: type, ptr: [*]u8) *T {
+    return @ptrCast(*T, @alignCast(@alignOf(*T), ptr));
+}
+
+/// Initializes all the fields of the static TLS area and returns the computed
+/// architecture-specific value of the thread-pointer register
+pub fn prepareTLS(area: []u8) usize {
+    // Clear the area we're going to use, just to be safe
+    mem.set(u8, area, 0);
     // Prepare the DTV
-    const dtv = @intToPtr(*DTV, addr + tls_img.dtv_offset);
+    const dtv = alignPtrCast(DTV, area.ptr + tls_image.dtv_offset);
     dtv.entries = 1;
-    dtv.tls_block[0] = addr + tls_img.data_offset + tls_dtv_offset;
-    // Set-up the TCB
-    // Force the alignment to 1 byte as the TCB may start from a non-aligned
-    // address under the variant II model
-    const tcb_ptr = @intToPtr(*align(1) usize, addr + tls_img.tcb_offset);
-    if (tls_variant == TLSVariant.VariantI) {
-        tcb_ptr.* = addr + tls_img.dtv_offset;
-    } else {
-        tcb_ptr.* = addr + tls_img.tcb_offset;
-    }
+    dtv.tls_block[0] = area.ptr + tls_dtv_offset + tls_image.data_offset;
+    // Prepare the TCB
+    const tcb_ptr = alignPtrCast([*]u8, area.ptr + tls_image.tcb_offset);
+    tcb_ptr.* = switch (tls_variant) {
+        .VariantI => area.ptr + tls_image.dtv_offset,
+        .VariantII => area.ptr + tls_image.tcb_offset,
+    };
     // Copy the data
-    @memcpy(@intToPtr([*]u8, addr + tls_img.data_offset), tls_img.data_src.ptr, tls_img.data_src.len);
+    mem.copy(u8, area[tls_image.data_offset..], tls_image.init_data);
     // Return the corrected (if needed) value for the tp register
-    return addr + tls_tp_offset +
-        if (tls_tp_points_past_tcb) tls_img.data_offset else tls_img.tcb_offset;
+    return @ptrToInt(area.ptr) + tls_tp_offset +
+        if (tls_tp_points_past_tcb) tls_image.data_offset else tls_image.tcb_offset;
 }
 
-var main_thread_tls_buffer: [256]u8 align(32) = undefined;
+var main_thread_tls_buffer: [256]u8 = undefined;
 
-pub fn allocateTLS(size: usize) usize {
-    // Small TLS allocation, use our local buffer
-    if (size < main_thread_tls_buffer.len) {
-        return @ptrToInt(&main_thread_tls_buffer);
-    }
-
-    const slice = os.mmap(
-        null,
-        size,
-        os.PROT_READ | os.PROT_WRITE,
-        os.MAP_PRIVATE | os.MAP_ANONYMOUS,
-        -1,
-        0,
-    ) catch @panic("out of memory");
-
-    return @ptrToInt(slice.ptr);
+pub fn initStaticTLS() void {
+    initTLS();
+
+    const alloc_tls_area: []u8 = blk: {
+        const full_alloc_size = tls_image.alloc_size + tls_image.alloc_align - 1;
+
+        // Fast path for the common case where the TLS data is really small,
+        // avoid an allocation and use our local buffer
+        if (full_alloc_size < main_thread_tls_buffer.len)
+            break :blk main_thread_tls_buffer[0..];
+
+        break :blk os.mmap(
+            null,
+            full_alloc_size,
+            os.PROT_READ | os.PROT_WRITE,
+            os.MAP_PRIVATE | os.MAP_ANONYMOUS,
+            -1,
+            0,
+        ) catch os.abort();
+    };
+
+    // Make sure the slice is correctly aligned
+    const begin_addr = mem.alignForward(@ptrToInt(alloc_tls_area.ptr), tls_image.alloc_align);
+    const start = begin_addr - @ptrToInt(alloc_tls_area.ptr);
+    const tls_area = alloc_tls_area[start .. start + tls_image.alloc_size];
+
+    const tp_value = prepareTLS(tls_area);
+    setThreadPointer(tp_value);
 }
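The .VariantI branch above pads by hand rather than calling mem.alignForward so that CustomData always ends exactly at the aligned thread pointer and can later be recovered as tp - @sizeOf(CustomData). A small sketch showing that the delta trick agrees with alignForward, using assumed sizes (16-byte DTV, 8-byte CustomData, 16-byte alignment) rather than values read from a real TLS segment:

    const std = @import("std");
    const mem = std.mem;

    test "variant I padding trick matches alignForward" {
        const dtv_size: usize = 16; // stand-in for @sizeOf(DTV)
        const custom_size: usize = 8; // stand-in for @sizeOf(CustomData)
        const align_factor: usize = 16; // stand-in for the segment p_align

        var l: usize = dtv_size;
        // Same computation as the .VariantI block: pad so that CustomData
        // ends on an aligned boundary, then lay it down
        const delta = (l + custom_size) & (align_factor - 1);
        if (delta > 0)
            l += align_factor - delta;
        l += custom_size;
        const tcb_offset = l;

        // The thread pointer is aligned and CustomData sits right below it
        std.debug.assert(tcb_offset & (align_factor - 1) == 0);
        std.debug.assert(tcb_offset == mem.alignForward(dtv_size + custom_size, align_factor));
    }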

lib/std/start.zig

@@ -152,13 +152,7 @@ fn posixCallMainAndExit() noreturn {
     const auxv = @ptrCast([*]std.elf.Auxv, @alignCast(@alignOf(usize), envp.ptr + envp_count + 1));
     std.os.linux.elf_aux_maybe = auxv;
     // Initialize the TLS area
-    const gnu_stack_phdr = std.os.linux.tls.initTLS() orelse @panic("ELF missing stack size");
-
-    if (std.os.linux.tls.tls_image) |tls_img| {
-        const tls_addr = std.os.linux.tls.allocateTLS(tls_img.alloc_size);
-        const tp = std.os.linux.tls.copyTLS(tls_addr);
-        std.os.linux.tls.setThreadPointer(tp);
-    }
+    std.os.linux.tls.initStaticTLS();
 
     // TODO This is disabled because what should we do when linking libc and this code
     // does not execute? And also it's causing a test failure in stack traces in release modes.
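For context on the single call that replaces the old three-step dance: initStaticTLS over-allocates by alloc_align - 1 bytes and aligns the base address itself, since the 256-byte scratch buffer no longer carries an align attribute and mmap only guarantees page alignment. A sketch of why that slack always suffices (all sizes are made-up placeholders):

    const std = @import("std");
    const mem = std.mem;

    test "over-allocation leaves room for an aligned slice" {
        const alloc_size: usize = 192; // stand-in for tls_image.alloc_size
        const alloc_align: usize = 64; // stand-in for tls_image.alloc_align
        var buf: [192 + 64 - 1]u8 = undefined;

        // Skip forward to the first aligned address, as initStaticTLS does
        // with the area it just obtained
        const base = @ptrToInt(&buf);
        const start = mem.alignForward(base, alloc_align) - base;

        std.debug.assert((base + start) & (alloc_align - 1) == 0);
        std.debug.assert(start + alloc_size <= buf.len);
    }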

lib/std/thread.zig

@@ -286,11 +286,9 @@ pub const Thread = struct {
             }
             // Finally, the Thread Local Storage, if any.
             if (!Thread.use_pthreads) {
-                if (os.linux.tls.tls_image) |tls_img| {
-                    l = mem.alignForward(l, @alignOf(usize));
-                    tls_start_offset = l;
-                    l += tls_img.alloc_size;
-                }
+                l = mem.alignForward(l, os.linux.tls.tls_image.alloc_align);
+                tls_start_offset = l;
+                l += os.linux.tls.tls_image.alloc_size;
             }
             // Round the size to the page size.
             break :blk mem.alignForward(l, mem.page_size);
@@ -396,18 +394,21 @@ pub const Thread = struct {
                 else => return os.unexpectedErrno(@intCast(usize, err)),
             }
         } else if (std.Target.current.os.tag == .linux) {
-            var flags: u32 = os.CLONE_VM | os.CLONE_FS | os.CLONE_FILES | os.CLONE_SIGHAND |
-                os.CLONE_THREAD | os.CLONE_SYSVSEM | os.CLONE_PARENT_SETTID | os.CLONE_CHILD_CLEARTID |
-                os.CLONE_DETACHED;
-            var newtls: usize = undefined;
+            const flags: u32 = os.CLONE_VM | os.CLONE_FS | os.CLONE_FILES |
+                os.CLONE_SIGHAND | os.CLONE_THREAD | os.CLONE_SYSVSEM |
+                os.CLONE_PARENT_SETTID | os.CLONE_CHILD_CLEARTID |
+                os.CLONE_DETACHED | os.CLONE_SETTLS;
             // This structure is only needed when targeting i386
             var user_desc: if (std.Target.current.cpu.arch == .i386) os.linux.user_desc else void = undefined;
 
-            if (os.linux.tls.tls_image) |tls_img| {
+            const tls_area = mmap_slice[tls_start_offset..];
+            const tp_value = os.linux.tls.prepareTLS(tls_area);
+
+            const newtls = blk: {
                 if (std.Target.current.cpu.arch == .i386) {
                     user_desc = os.linux.user_desc{
-                        .entry_number = tls_img.gdt_entry_number,
-                        .base_addr = os.linux.tls.copyTLS(mmap_addr + tls_start_offset),
+                        .entry_number = os.linux.tls.tls_image.gdt_entry_number,
+                        .base_addr = tp_value,
                         .limit = 0xfffff,
                         .seg_32bit = 1,
                         .contents = 0, // Data
@@ -416,12 +417,11 @@ pub const Thread = struct {
                         .seg_not_present = 0,
                         .useable = 1,
                     };
-                    newtls = @ptrToInt(&user_desc);
+                    break :blk @ptrToInt(&user_desc);
                 } else {
-                    newtls = os.linux.tls.copyTLS(mmap_addr + tls_start_offset);
+                    break :blk tp_value;
                 }
-                flags |= os.CLONE_SETTLS;
-            }
+            };
 
             const rc = os.linux.clone(
                 MainFuncs.linuxThreadMain,
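A toy version of the spawn-path arithmetic: the TLS area is carved out of the same mapping as the new thread's stack at tls_start_offset, aligned to the image's alloc_align, with the whole mapping rounded up to a page. The sizes below are placeholders, not the real defaults:

    const std = @import("std");
    const mem = std.mem;

    test "thread mapping reserves an aligned TLS area" {
        const stack_size: usize = 16 * mem.page_size;
        const tls_alloc_size: usize = 320; // stand-in for tls_image.alloc_size
        const tls_alloc_align: usize = 64; // stand-in for tls_image.alloc_align

        var l: usize = stack_size;
        l = mem.alignForward(l, tls_alloc_align);
        const tls_start_offset = l;
        l += tls_alloc_size;
        const mapping_len = mem.alignForward(l, mem.page_size);

        std.debug.assert(tls_start_offset % tls_alloc_align == 0);
        std.debug.assert(tls_start_offset + tls_alloc_size <= mapping_len);
        std.debug.assert(mapping_len % mem.page_size == 0);
    }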

test/stack_traces.zig

@@ -282,7 +282,7 @@ pub fn addCases(cases: *tests.StackTracesContext) void {
         \\source.zig:10:8: [address] in main (test)
         \\    foo();
         \\       ^
-        \\start.zig:256:29: [address] in std.start.posixCallMainAndExit (test)
+        \\start.zig:250:29: [address] in std.start.posixCallMainAndExit (test)
         \\            return root.main();
         \\                            ^
         \\start.zig:123:5: [address] in std.start._start (test)