mirror of
https://codeberg.org/ziglang/zig.git
synced 2025-12-06 13:54:21 +00:00
Merge 4c228fc220 into d0ba6642b5
This commit is contained in:
commit
536cd84b06
1 changed files with 161 additions and 91 deletions
|
|
@ -233,6 +233,153 @@ pub fn utf8ValidateSlice(input: []const u8) bool {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
|
fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
|
||||||
|
// Table-driven DFA used to validate UTF-8.
//
// The transitions in `step` are written for input that is consumed from the
// last byte towards the first: continuation bytes are seen before the lead
// byte that introduces them, so the states track how many continuation bytes
// are pending and which sub-range the most recently read one fell into.
//
// Every state is identified by a bit offset (`offset_from_state`). For each
// byte value, `shift_table` packs the offsets of all successor states into one
// `u32`; the successor of state `s` sits at bit position `offset_from_state(s)`.
// A transition is therefore `shift_table[byte] >> current_offset`, with the
// low five bits of the result being the next state's offset.
const DFA = struct {
    /// Partition of the 256 byte values into the classes the DFA needs to
    /// tell apart.
    const ByteClass = enum {
        // base ASCII
        ascii,
        // continuation bytes: 0x80...0x8f, 0x90...0x9f, 0xa0...0xbf
        cont1,
        cont2,
        cont3,
        // starting bytes of 2-byte codepoint: 0xc0...0xc1 (rejected by every
        // state in `step`), 0xc2...0xdf
        two1,
        two2,
        // starting bytes of 3-byte codepoint: 0xe0, 0xe1...0xec and
        // 0xee...0xef, 0xed
        three1,
        three2,
        three3,
        // starting bytes of 4-byte codepoint: 0xf0, 0xf1...0xf3, 0xf4,
        // 0xf5...0xff (rejected by every state in `step`)
        four1,
        four2,
        four3,
        four4,
    };

    /// Maps a byte to its `ByteClass`.
    pub fn byte_class(byte: u8) ByteClass {
        return switch (byte) {
            0x00...0x7f => .ascii,
            0x80...0x8f => .cont1,
            0x90...0x9f => .cont2,
            0xa0...0xbf => .cont3,
            0xc0...0xc1 => .two1,
            0xc2...0xdf => .two2,
            0xe0...0xe0 => .three1,
            0xe1...0xec => .three2,
            0xed...0xed => .three3,
            0xee...0xef => .three2,
            0xf0...0xf0 => .four1,
            0xf1...0xf3 => .four2,
            0xf4...0xf4 => .four3,
            0xf5...0xff => .four4,
        };
    }

    /// DFA states.
    ///
    /// * `ok`: no continuation bytes are pending.
    /// * `one`: one continuation byte has been read.
    /// * `two1` / `two2`: two continuation bytes have been read; the most
    ///   recently read one was in 0x80...0x9f / 0xa0...0xbf.
    /// * `three1` / `three2`: three continuation bytes have been read; the
    ///   most recently read one was in 0x80...0x8f / 0x90...0xbf.
    /// * `fail`: absorbing error state.
    const State = enum {
        ok,
        one,
        two1,
        two2,
        three1,
        three2,
        fail,
    };

    /// Returns the bit offset that identifies `state` inside a `shift_table`
    /// entry. Inverse of `state_from_offset`.
    fn offset_from_state(state: State) u5 {
        return switch (state) {
            .ok => 8,
            .one => 13,
            .two1 => 23,
            .two2 => 18,
            .three1 => 3,
            .three2 => 28,
            .fail => 0,
        };
    }

    /// Returns the state identified by `offset`. Any value that
    /// `offset_from_state` never produces is treated as unreachable.
    fn state_from_offset(offset: u5) State {
        return switch (offset) {
            8 => .ok,
            13 => .one,
            23 => .two1,
            18 => .two2,
            3 => .three1,
            28 => .three2,
            0 => .fail,
            else => unreachable,
        };
    }

    /// Initial state.
    const start: State = .ok;
    /// Accepting states. Declared as a const slice over an anonymous array so
    /// that the declaration type-checks when it is referenced.
    const accept: []const State = &.{.ok};
    /// Absorbing error state.
    const fail: ?State = .fail;

    /// Transition function: the state reached from `state` after reading
    /// `byte`. Whether 0xed may follow a continuation byte in 0xa0...0xbf
    /// (i.e. whether encoded surrogate halves are accepted) depends on the
    /// enclosing function's comptime `surrogates` parameter.
    fn step(byte: u8, state: State) State {
        const class = byte_class(byte);
        return switch (state) {
            .ok => switch (class) {
                .ascii => .ok,
                .cont1 => .one,
                .cont2 => .one,
                .cont3 => .one,
                else => .fail,
            },
            .one => switch (class) {
                .two2 => .ok,
                .cont1 => .two1,
                .cont2 => .two1,
                .cont3 => .two2,
                else => .fail,
            },
            .two1 => switch (class) {
                .three2 => .ok,
                .three3 => .ok,
                .cont1 => .three1,
                .cont2 => .three2,
                .cont3 => .three2,
                else => .fail,
            },
            .two2 => switch (class) {
                .three1 => .ok,
                .three2 => .ok,
                .three3 => switch (surrogates) {
                    .cannot_encode_surrogate_half => .fail,
                    .can_encode_surrogate_half => .ok,
                },
                .cont1 => .three1,
                .cont2 => .three2,
                .cont3 => .three2,
                else => .fail,
            },
            .three1 => switch (class) {
                .four2 => .ok,
                .four3 => .ok,
                else => .fail,
            },
            .three2 => switch (class) {
                .four1 => .ok,
                .four2 => .ok,
                else => .fail,
            },
            .fail => .fail,
        };
    }

    /// Comptime-built transition table, indexed by byte value. Bits
    /// `offset_from_state(s)` and up of entry `c` hold
    /// `offset_from_state(step(c, s))`.
    const shift_table = blk: {
        @setEvalBranchQuota(30000);
        var t: [256]u32 = @splat(0);
        for (&t, 0..) |*r, c| {
            for (std.enums.values(State)) |s| {
                r.* |= @truncate(@as(u32, offset_from_state(step(c, s))) << offset_from_state(s));
            }
            // Make sure the states didn't overlap and destroy themselves.
            // The field at offset 28 only has four bits of room in a u32, so
            // this also confirms no successor offset lost its top bit there.
            for (std.enums.values(State)) |s| {
                std.debug.assert(@as(u5, @truncate(r.* >> offset_from_state(s))) == offset_from_state(step(c, s)));
            }
        }
        break :blk t;
    };
};
|
||||||
|
|
||||||
var remaining = input;
|
var remaining = input;
|
||||||
|
|
||||||
if (std.simd.suggestVectorLength(u8)) |chunk_len| {
|
if (std.simd.suggestVectorLength(u8)) |chunk_len| {
|
||||||
|
|
@ -250,101 +397,24 @@ fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) boo
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// default lowest and highest continuation byte
|
var state: u32 = DFA.offset_from_state(DFA.State.ok);
|
||||||
const lo_cb = 0b10000000;
|
// Manually unrolled to insert early return.
|
||||||
const hi_cb = 0b10111111;
|
const UNROLL = 8;
|
||||||
|
while (remaining.len > UNROLL) {
|
||||||
const min_non_ascii_codepoint = 0x80;
|
for (0..UNROLL) |i| {
|
||||||
|
const byte = remaining[remaining.len - 1 - i];
|
||||||
// The first nibble is used to identify the continuation byte range to
|
state = DFA.shift_table[byte] >> @truncate(state);
|
||||||
// accept. The second nibble is the size.
|
|
||||||
const xx = 0xF1; // invalid: size 1
|
|
||||||
const as = 0xF0; // ASCII: size 1
|
|
||||||
const s1 = 0x02; // accept 0, size 2
|
|
||||||
const s2 = switch (surrogates) {
|
|
||||||
.cannot_encode_surrogate_half => 0x13, // accept 1, size 3
|
|
||||||
.can_encode_surrogate_half => 0x03, // accept 0, size 3
|
|
||||||
};
|
|
||||||
const s3 = 0x03; // accept 0, size 3
|
|
||||||
const s4 = switch (surrogates) {
|
|
||||||
.cannot_encode_surrogate_half => 0x23, // accept 2, size 3
|
|
||||||
.can_encode_surrogate_half => 0x03, // accept 0, size 3
|
|
||||||
};
|
|
||||||
const s5 = 0x34; // accept 3, size 4
|
|
||||||
const s6 = 0x04; // accept 0, size 4
|
|
||||||
const s7 = 0x44; // accept 4, size 4
|
|
||||||
|
|
||||||
// Information about the first byte in a UTF-8 sequence.
|
|
||||||
const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
|
|
||||||
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
|
|
||||||
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
|
|
||||||
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
|
|
||||||
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
|
|
||||||
};
|
|
||||||
|
|
||||||
const n = remaining.len;
|
|
||||||
var i: usize = 0;
|
|
||||||
while (i < n) {
|
|
||||||
const first_byte = remaining[i];
|
|
||||||
if (first_byte < min_non_ascii_codepoint) {
|
|
||||||
i += 1;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
remaining = remaining[0 .. remaining.len - UNROLL];
|
||||||
const info = first[first_byte];
|
if (@as(u5, @truncate(state)) == DFA.offset_from_state(DFA.State.fail)) {
|
||||||
if (info == xx) {
|
|
||||||
return false; // Illegal starter byte.
|
|
||||||
}
|
|
||||||
|
|
||||||
const size = info & 7;
|
|
||||||
if (i + size > n) {
|
|
||||||
return false; // Short or invalid.
|
|
||||||
}
|
|
||||||
|
|
||||||
// Figure out the acceptable low and high continuation bytes, starting
|
|
||||||
// with our defaults.
|
|
||||||
var accept_lo: u8 = lo_cb;
|
|
||||||
var accept_hi: u8 = hi_cb;
|
|
||||||
|
|
||||||
switch (info >> 4) {
|
|
||||||
0 => {},
|
|
||||||
1 => accept_lo = 0xA0,
|
|
||||||
2 => accept_hi = 0x9F,
|
|
||||||
3 => accept_lo = 0x90,
|
|
||||||
4 => accept_hi = 0x8F,
|
|
||||||
else => unreachable,
|
|
||||||
}
|
|
||||||
|
|
||||||
const c1 = remaining[i + 1];
|
|
||||||
if (c1 < accept_lo or accept_hi < c1) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (size) {
|
|
||||||
2 => i += 2,
|
|
||||||
3 => {
|
|
||||||
const c2 = remaining[i + 2];
|
|
||||||
if (c2 < lo_cb or hi_cb < c2) {
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
i += 3;
|
for (0..remaining.len) |i| {
|
||||||
},
|
const byte = remaining[remaining.len - 1 - i];
|
||||||
4 => {
|
state = DFA.shift_table[byte] >> @truncate(state);
|
||||||
const c2 = remaining[i + 2];
|
|
||||||
if (c2 < lo_cb or hi_cb < c2) {
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
const c3 = remaining[i + 3];
|
return @as(u5, @truncate(state)) == DFA.offset_from_state(DFA.State.ok);
|
||||||
if (c3 < lo_cb or hi_cb < c3) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
i += 4;
|
|
||||||
},
|
|
||||||
else => unreachable,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Utf8View iterates the code points of a utf-8 encoded string.
|
/// Utf8View iterates the code points of a utf-8 encoded string.
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue