This commit is contained in:
Nurul Huda (Apon) 2025-11-25 13:29:46 +01:00 committed by GitHub
commit 588f3f3155
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 125 additions and 19 deletions

View file

@ -64,6 +64,7 @@ pub const ValueOptions = struct {
emit_codepoint_literals: EmitCodepointLiterals = .never,
emit_strings_as_containers: bool = false,
emit_default_optional_fields: bool = true,
escape_non_ascii: bool = false,
};
/// Determines when to emit Unicode code point literals as opposed to integer literals.
@ -125,7 +126,7 @@ pub fn valueArbitraryDepth(self: *Serializer, val: anytype, options: ValueOption
comptime assert(canSerializeType(@TypeOf(val)));
switch (@typeInfo(@TypeOf(val))) {
.int, .comptime_int => if (options.emit_codepoint_literals.emitAsCodepoint(val)) |c| {
self.codePoint(c) catch |err| switch (err) {
self.codePoint(c, .{ .escape_non_ascii = options.escape_non_ascii }) catch |err| switch (err) {
error.InvalidCodepoint => unreachable, // Already validated
else => |e| return e,
};
@ -146,7 +147,7 @@ pub fn valueArbitraryDepth(self: *Serializer, val: anytype, options: ValueOption
(pointer.sentinel() == null or pointer.sentinel() == 0) and
!options.emit_strings_as_containers)
{
return try self.string(val);
return try self.string(val, .{ .escape_non_ascii = options.escape_non_ascii });
}
// Serialize as either a tuple or as the child type
@ -280,12 +281,21 @@ pub fn ident(self: *Serializer, name: []const u8) Error!void {
}
pub const CodePointError = Error || error{InvalidCodepoint};
/// Options for formatting code points.
///
/// Passed to `codePoint`, which forwards `escape_non_ascii` to the codepoint writer.
pub const CodePointOptions = struct {
/// If true, codepoints outside the ASCII range are written as escape sequences
/// instead of as raw bytes. Defaults to false.
escape_non_ascii: bool = false,
};
/// Serialize `val` as a Unicode codepoint.
///
/// Returns `error.InvalidCodepoint` if `val` is not a valid Unicode codepoint.
pub fn codePoint(self: *Serializer, val: u21) CodePointError!void {
try self.writer.print("'{f}'", .{std.zig.fmtChar(val)});
/// Serialize `val` as a single-quoted Unicode codepoint literal.
///
/// `options.escape_non_ascii` selects whether non-ASCII codepoints are written as
/// escape sequences or as raw bytes.
///
/// Returns `error.InvalidCodepoint` if `val` is greater than U+10FFFF. In that case
/// nothing is written to the underlying writer.
pub fn codePoint(self: *Serializer, val: u21, options: CodePointOptions) CodePointError!void {
    // Validate before emitting the opening quote; otherwise a rejected value would
    // leave a dangling `'` in the output.
    if (val > 0x10FFFF) return error.InvalidCodepoint;

    try self.writer.writeByte('\'');
    try self.writeCodepoint(val, .{
        .escape_non_ascii = options.escape_non_ascii,
        .quote_style = .single,
    });
    try self.writer.writeByte('\'');
}
/// Like `value`, but always serializes `val` as a tuple.
@ -341,9 +351,101 @@ fn tupleImpl(self: *Serializer, val: anytype, options: ValueOptions) Error!void
}
}
/// Options for writing a Unicode codepoint.
///
/// Consumed by `writeCodepoint`; not part of the public API.
const WriteCodepointOptions = struct {
/// If true, non-ASCII codepoints are written as escape sequences rather than raw bytes.
escape_non_ascii: bool = false,
/// If single quote style then single quotes are escaped, otherwise double quotes are escaped.
/// The other quote character is written unescaped.
quote_style: enum { single, double } = .single,
};
/// Write a Unicode codepoint to the writer using the given options.
///
/// The surrounding quote characters are not written; callers emit them. ASCII is
/// written raw when printable and escaped otherwise. Non-ASCII codepoints are either
/// UTF-8 encoded or escaped, depending on `options.escape_non_ascii`. Surrogates are
/// always escaped.
///
/// Returns `error.InvalidCodepoint` if `val` is not a valid Unicode codepoint
/// (greater than U+10FFFF).
fn writeCodepoint(self: *Serializer, val: u21, options: WriteCodepointOptions) CodePointError!void {
    switch (val) {
        // Printable ASCII
        ' ', '!', '#'...'&', '('...'[', ']'...'~' => try self.writer.writeByte(@intCast(val)),
        // Unprintable ASCII
        0x00...0x08, 0x0B, 0x0C, 0x0E...0x1F, 0x7F => try self.writer.print("\\x{x:0>2}", .{val}),
        // ASCII with special escapes
        '\n' => try self.writer.writeAll("\\n"),
        '\r' => try self.writer.writeAll("\\r"),
        '\t' => try self.writer.writeAll("\\t"),
        '\\' => try self.writer.writeAll("\\\\"),
        // Quotes need escaping if they conflict with the in-use quote character
        '\'' => if (options.quote_style == .single) try self.writer.writeAll("\\'") else try self.writer.writeByte('\''),
        '\"' => if (options.quote_style == .double) try self.writer.writeAll("\\\"") else try self.writer.writeByte('"'),
        // U+0080..U+00FF: fits in one byte as an integer, but is two bytes in UTF-8
        0x80...0xFF => if (options.escape_non_ascii) {
            switch (options.quote_style) {
                // In a character literal `\xNN` denotes the value NN.
                .single => try self.writer.print("\\x{x:0>2}", .{val}),
                // In a string literal `\xNN` denotes the raw byte NN, not the codepoint,
                // so the codepoint escape must be used to preserve the string's contents.
                .double => try self.writer.print("\\u{{{x}}}", .{val}),
            }
        } else {
            // Writing `val` as a single byte would produce invalid UTF-8; encode it.
            var buf: [4]u8 = undefined;
            const len = std.unicode.utf8Encode(val, &buf) catch unreachable;
            try self.writer.writeAll(buf[0..len]);
        },
        // Surrogates can only be written with an escape
        0xD800...0xDFFF => try self.writer.print("\\u{{{x}}}", .{val}),
        // Other valid codepoints
        0x100...0xD7FF, 0xE000...0x10FFFF => if (options.escape_non_ascii) {
            try self.writer.print("\\u{{{x}}}", .{val});
        } else {
            // A UTF-8 sequence is at most four bytes. Surrogates and out-of-range values
            // are excluded by this arm's ranges, so encoding cannot fail.
            var buf: [4]u8 = undefined;
            const len = std.unicode.utf8Encode(val, &buf) catch unreachable;
            try self.writer.writeAll(buf[0..len]);
        },
        // Invalid codepoints
        0x110000...std.math.maxInt(u21) => return error.InvalidCodepoint,
    }
}
/// Options for serializing a string with `string`.
pub const StringOptions = struct {
/// If true, bytes >= 0x80 are not written as-is; they are emitted as escape sequences.
/// If false, such bytes are copied to the output unchanged.
escape_non_ascii: bool = false,
};
/// Like `value`, but always serializes `val` as a string.
pub fn string(self: *Serializer, val: []const u8) Error!void {
try self.writer.print("\"{f}\"", .{std.zig.fmtString(val)});
/// Like `value`, but always serializes `val` as a double-quoted string literal.
///
/// Printable ASCII other than `"` and `\` is written as-is. Other ASCII bytes are
/// escaped. Bytes >= 0x80 are copied unchanged unless `options.escape_non_ascii` is
/// set. When it is set, each well-formed UTF-8 sequence is written as a `\u{...}`
/// escape, and any byte that is not part of a well-formed sequence is written as a
/// `\xNN` byte escape so that the string's bytes are preserved exactly.
///
/// `val` is not required to be valid UTF-8.
pub fn string(self: *Serializer, val: []const u8, options: StringOptions) Writer.Error!void {
    try self.writer.writeByte('"');

    // Batch write sequences of "raw" bytes (printable ASCII or non-escaped non-ASCII) for performance.
    // `val[start..i]` contains pending raw bytes to write.
    var start: usize = 0;
    var i: usize = 0;
    while (i < val.len) {
        const byte = val[i];

        // Check if this byte can be written as-is
        const is_raw = switch (byte) {
            ' ', '!', '#'...'[', ']'...'~' => true,
            0x80...0xFF => !options.escape_non_ascii,
            else => false,
        };
        if (is_raw) {
            i += 1;
            continue;
        }

        // Flush pending raw bytes
        try self.writer.writeAll(val[start..i]);

        if (byte >= 0x80) {
            // Only reached when `escape_non_ascii` is set. The input is arbitrary bytes,
            // so decoding may fail: bad lead byte, truncated sequence, bad continuation
            // byte, overlong form, or encoded surrogate.
            const ulen: usize = std.unicode.utf8ByteSequenceLength(byte) catch 0;
            const codepoint: ?u21 = if (ulen != 0 and ulen <= val.len - i)
                std.unicode.utf8Decode(val[i..][0..ulen]) catch null
            else
                null;

            if (codepoint) |cp| {
                // `\u{...}` is the only string escape that denotes a codepoint; `\xNN`
                // would denote a single byte and change the string's contents.
                try self.writer.print("\\u{{{x}}}", .{cp});
                i += ulen;
            } else {
                // Not decodable: escape this one byte verbatim and resynchronize.
                try self.writer.print("\\x{x:0>2}", .{byte});
                i += 1;
            }
        } else {
            // ASCII character that needs escaping
            self.writeCodepoint(byte, .{
                .escape_non_ascii = options.escape_non_ascii,
                .quote_style = .double,
            }) catch |err| switch (err) {
                error.InvalidCodepoint => unreachable, // ASCII is always a valid codepoint
                else => |e| return e, // Writer failures must propagate
            };
            i += 1;
        }
        start = i;
    }

    try self.writer.writeAll(val[start..]);
    try self.writer.writeByte('"');
}
/// Options for formatting multiline strings.

View file

@ -24,7 +24,7 @@
const std = @import("std");
const assert = std.debug.assert;
const Writer = std.Io.Writer;
const Serializer = std.zon.Serializer;
const Serializer = @import("Serializer.zig");
pub const SerializeOptions = struct {
/// If false, whitespace is omitted. Otherwise whitespace is emitted in standard Zig style.
@ -37,6 +37,8 @@ pub const SerializeOptions = struct {
/// If false, struct fields are not written if they are equal to their default value. Comparison
/// is done by `std.meta.eql`.
emit_default_optional_fields: bool = true,
/// If true, non-ASCII unicode characters are escaped.
escape_non_ascii: bool = false,
};
/// Serialize the given value as ZON.
@ -51,6 +53,7 @@ pub fn serialize(val: anytype, options: SerializeOptions, writer: *Writer) Write
.emit_codepoint_literals = options.emit_codepoint_literals,
.emit_strings_as_containers = options.emit_strings_as_containers,
.emit_default_optional_fields = options.emit_default_optional_fields,
.escape_non_ascii = options.escape_non_ascii,
});
}
@ -72,6 +75,7 @@ pub fn serializeMaxDepth(
.emit_codepoint_literals = options.emit_codepoint_literals,
.emit_strings_as_containers = options.emit_strings_as_containers,
.emit_default_optional_fields = options.emit_default_optional_fields,
.escape_non_ascii = options.escape_non_ascii,
}, depth);
}
@ -91,6 +95,7 @@ pub fn serializeArbitraryDepth(
.emit_codepoint_literals = options.emit_codepoint_literals,
.emit_strings_as_containers = options.emit_strings_as_containers,
.emit_default_optional_fields = options.emit_default_optional_fields,
.escape_non_ascii = options.escape_non_ascii,
});
}
@ -588,7 +593,7 @@ test "std.zon stringify utf8 codepoints" {
try std.testing.expectEqualStrings("97", aw.written());
aw.clearRetainingCapacity();
try s.codePoint('a');
try s.codePoint('a', .{});
try std.testing.expectEqualStrings("'a'", aw.written());
aw.clearRetainingCapacity();
@ -609,7 +614,7 @@ test "std.zon stringify utf8 codepoints" {
try std.testing.expectEqualStrings("10", aw.written());
aw.clearRetainingCapacity();
try s.codePoint('\n');
try s.codePoint('\n', .{});
try std.testing.expectEqualStrings("'\\n'", aw.written());
aw.clearRetainingCapacity();
@ -630,11 +635,11 @@ test "std.zon stringify utf8 codepoints" {
try std.testing.expectEqualStrings("9889", aw.written());
aw.clearRetainingCapacity();
try s.codePoint('⚡');
try s.codePoint('⚡', .{ .escape_non_ascii = true });
try std.testing.expectEqualStrings("'\\u{26a1}'", aw.written());
aw.clearRetainingCapacity();
try s.value('⚡', .{ .emit_codepoint_literals = .always });
try s.value('⚡', .{ .emit_codepoint_literals = .always, .escape_non_ascii = true });
try std.testing.expectEqualStrings("'\\u{26a1}'", aw.written());
aw.clearRetainingCapacity();
@ -647,8 +652,7 @@ test "std.zon stringify utf8 codepoints" {
aw.clearRetainingCapacity();
// Invalid codepoint
try s.codePoint(0x110000 + 1);
try std.testing.expectEqualStrings("'\\u{110001}'", aw.written());
try std.testing.expectError(error.InvalidCodepoint, s.codePoint(0x110000 + 1, .{ .escape_non_ascii = true }));
aw.clearRetainingCapacity();
try s.int(0x110000 + 1);
@ -681,7 +685,7 @@ test "std.zon stringify utf8 codepoints" {
aw.clearRetainingCapacity();
// Make sure value options are passed to children
try s.value(.{ .c = '⚡' }, .{ .emit_codepoint_literals = .always });
try s.value(.{ .c = '⚡' }, .{ .emit_codepoint_literals = .always, .escape_non_ascii = true });
try std.testing.expectEqualStrings(".{ .c = '\\u{26a1}' }", aw.written());
aw.clearRetainingCapacity();
@ -696,8 +700,8 @@ test "std.zon stringify strings" {
defer aw.deinit();
// Minimal case
try s.string("abc⚡\n");
try std.testing.expectEqualStrings("\"abc\\xe2\\x9a\\xa1\\n\"", aw.written());
try s.string("abc⚡\n", .{ .escape_non_ascii = true });
try std.testing.expectEqualStrings("\"abc\\u{26a1}\\n\"", aw.written());
aw.clearRetainingCapacity();
try s.tuple("abc⚡\n", .{});
@ -714,8 +718,8 @@ test "std.zon stringify strings" {
, aw.written());
aw.clearRetainingCapacity();
try s.value("abc⚡\n", .{});
try std.testing.expectEqualStrings("\"abc\\xe2\\x9a\\xa1\\n\"", aw.written());
// With escaping disabled the non-ASCII character is written through unchanged.
try s.value("abc⚡\n", .{ .escape_non_ascii = false });
try std.testing.expectEqualStrings("\"abc⚡\\n\"", aw.written());
aw.clearRetainingCapacity();
try s.value("abc⚡\n", .{ .emit_strings_as_containers = true });
@ -816,7 +820,7 @@ test "std.zon stringify multiline strings" {
{
const str: []const u8 = &.{ 'a', '\r', 'c' };
try s.string(str);
try s.string(str, .{ .escape_non_ascii = false });
try std.testing.expectEqualStrings("\"a\\rc\"", aw.written());
aw.clearRetainingCapacity();
}