mirror of
https://codeberg.org/ziglang/zig.git
synced 2025-12-06 22:04:21 +00:00
1660 lines
59 KiB
Zig
1660 lines
59 KiB
Zig
//! A Markdown parser producing `Document`s.
//!
//! The parser operates at two levels: at the outer level, the parser accepts
//! the content of an input document line by line and begins building the _block
//! structure_ of the document. This creates a stack of currently open blocks.
//!
//! When the parser detects the end of a block, it closes the block, popping it
//! from the open block stack and completing any additional parsing of the
//! block's content. For blocks which contain parseable inline content, this
//! invokes the inner level of the parser, handling the _inline structure_ of
//! the block.
//!
//! Inline parsing scans through the collected inline content of a block. When
//! it encounters a character that could indicate the beginning of an inline, it
//! either handles the inline right away (if possible) or adds it to a pending
//! inlines stack. When an inline is completed, it is added to a list of
//! completed inlines, which (along with any surrounding text nodes) will become
//! the children of the parent inline or the block whose inline content is being
//! parsed.
|
|
|
|
const std = @import("std");
|
|
const mem = std.mem;
|
|
const assert = std.debug.assert;
|
|
const isWhitespace = std.ascii.isWhitespace;
|
|
const Allocator = mem.Allocator;
|
|
const expectEqual = std.testing.expectEqual;
|
|
const Document = @import("Document.zig");
|
|
const Node = Document.Node;
|
|
const ExtraIndex = Document.ExtraIndex;
|
|
const ExtraData = Document.ExtraData;
|
|
const StringIndex = Document.StringIndex;
|
|
|
|
/// Every node created so far. `init` appends the root node at index 0 and
/// `endInput` hands the list over to the returned `Document`.
nodes: Node.List = .{},
/// Extra data referenced by nodes; handed over to the `Document` by
/// `endInput`.
extra: std.ArrayListUnmanaged(u32) = .empty,
/// Scratch list of the node indexes produced for blocks that are still open.
/// Each pending block remembers where its own entries begin (`extra_start`).
scratch_extra: std.ArrayListUnmanaged(u32) = .empty,
/// String storage handed over to the `Document` by `endInput`. `init` seeds it
/// with a single 0 byte.
string_bytes: std.ArrayListUnmanaged(u8) = .empty,
/// Scratch buffer of the text collected for blocks that are still open. Each
/// pending block remembers where its own text begins (`string_start`).
scratch_string: std.ArrayListUnmanaged(u8) = .empty,
/// Stack of the blocks which are currently open, outermost first.
pending_blocks: std.ArrayListUnmanaged(Block) = .empty,
/// Allocator used for all of the lists above.
allocator: Allocator,

const Parser = @This();
|
|
|
|
/// Upper bound on the number of columns a table may have. The limit itself is
/// arbitrary; it exists so that the per-table metadata kept by the parser fits
/// in fixed-size storage and needs no dynamic memory allocation.
const max_table_columns = 128;
|
|
|
|
/// A block element which is still receiving children.
const Block = struct {
    /// The kind of block; determines which field of `data` is active.
    tag: Tag,
    /// Per-kind state. The active field is documented on each `Tag` member.
    data: Data,
    /// Length of the parser's `scratch_extra` when this block was opened.
    /// Entries from this offset onward are consumed when the block is closed.
    extra_start: usize,
    /// Length of the parser's `scratch_string` when this block was opened.
    /// Text from this offset onward is consumed when the block is closed.
    string_start: usize,

    const Tag = enum {
        /// Data is `list`.
        list,
        /// Data is `list_item`.
        list_item,
        /// Data is `table`.
        table,
        /// Data is `none`.
        table_row,
        /// Data is `heading`.
        heading,
        /// Data is `code_block`.
        code_block,
        /// Data is `none`.
        blockquote,
        /// Data is `none`.
        paragraph,
        /// Data is `none`.
        thematic_break,
    };

    const Data = union {
        none: void,
        list: struct {
            /// The marker shared by every item of the list.
            marker: ListMarker,
            /// Between 0 and 999,999,999, inclusive.
            start: u30,
            /// Cleared once a blank line is found between children of the
            /// list or of one of its items.
            tight: bool,
            /// Whether the most recent line fed to the innermost list item
            /// was blank.
            last_line_blank: bool = false,
        },
        list_item: struct {
            /// Number of leading bytes a line must be indented by to continue
            /// the item; also the number of bytes stripped from such a line.
            continuation_indent: usize,
        },
        table: struct {
            /// Alignments taken from the header delimiter row (empty until
            /// such a row has been seen).
            column_alignments: std.BoundedArray(Node.TableCellAlignment, max_table_columns) = .{},
        },
        heading: struct {
            /// Between 1 and 6, inclusive.
            level: u3,
        },
        code_block: struct {
            /// The tag written after the opening fence.
            tag: StringIndex,
            /// Number of backticks in the opening fence; a closing fence must
            /// have exactly this many.
            fence_len: usize,
            /// Indentation of the opening fence; at most this much leading
            /// whitespace is stripped from each content line.
            indent: usize,
        },

        const ListMarker = enum {
            @"-",
            @"*",
            @"+",
            number_dot,
            number_paren,
        };
    };

    /// The kind of content that can be appended directly to a block.
    const ContentType = enum {
        blocks,
        inlines,
        raw_inlines,
        nothing,
    };

    /// Reports what kind of content `b` is able to receive.
    fn canAccept(b: Block) ContentType {
        return switch (b.tag) {
            .table_row, .thematic_break => .nothing,
            .code_block => .raw_inlines,
            .heading, .paragraph => .inlines,
            .list, .list_item, .table, .blockquote => .blocks,
        };
    }

    /// Attempts to continue `b` using the contents of `line`. If successful,
    /// returns the remaining portion of `line` to be considered part of `b`
    /// (e.g. for a blockquote, this would be everything except the leading
    /// `>`). If unsuccessful, returns null.
    fn match(b: Block, line: []const u8) ?[]const u8 {
        const content = mem.trimLeft(u8, line, " \t");
        const leading = line.len - content.len;
        const is_blank = content.len == 0;

        switch (b.tag) {
            // A list stays open for any line; whether it really continues is
            // decided when the next block start is appended.
            .list => return line,

            // These blocks never span more than the line that opened them.
            .table_row, .heading, .thematic_break => return null,

            // Tables and paragraphs are ended by a blank line.
            .table, .paragraph => return if (is_blank) null else line,

            .list_item => {
                const required = b.data.list_item.continuation_indent;
                if (leading >= required) return line[required..];
                // Blank lines should not close list items, since there may be
                // more indented contents to follow after the blank line.
                return if (is_blank) "" else null;
            },

            .code_block => {
                const fence = b.data.code_block;
                const candidate = mem.trimRight(u8, content, " \t");
                const only_backticks = mem.indexOfNone(u8, candidate, "`") == null;
                // Only a run of exactly `fence_len` backticks closes the block.
                if (only_backticks and candidate.len == fence.fence_len) return null;
                // Content lines lose at most the indentation of the fence.
                return line[@min(leading, fence.indent)..];
            },

            .blockquote => return if (mem.startsWith(u8, content, ">"))
                content[1..]
            else
                null,
        }
    }
};
|
|
|
|
/// Creates a parser which allocates with `allocator`.
///
/// The returned parser already contains the root node (node index 0, whose
/// data is filled in by `endInput`) and a single 0 byte in `string_bytes`.
/// The caller must eventually call `deinit`.
///
/// Returns `error.OutOfMemory` if either initial allocation fails; in that
/// case nothing allocated by this function is leaked.
pub fn init(allocator: Allocator) Allocator.Error!Parser {
    var p: Parser = .{ .allocator = allocator };
    // If the second append fails, the storage obtained by the first one must
    // be released, since the caller never receives a parser to `deinit`.
    errdefer p.deinit();

    try p.nodes.append(allocator, .{
        .tag = .root,
        .data = undefined,
    });
    try p.string_bytes.append(allocator, 0);
    return p;
}
|
|
|
|
/// Frees every list owned by the parser and invalidates `p`.
pub fn deinit(p: *Parser) void {
    const allocator = p.allocator;
    p.pending_blocks.deinit(allocator);
    p.scratch_string.deinit(allocator);
    p.string_bytes.deinit(allocator);
    p.scratch_extra.deinit(allocator);
    p.extra.deinit(allocator);
    p.nodes.deinit(allocator);
    p.* = undefined;
}
|
|
|
|
/// Feeds one line of the input document to the parser. `line` must not
/// include its trailing line ending character.
///
/// The line is first matched against the stack of open blocks; blocks which
/// it does not continue are closed, any blocks it starts are opened, and the
/// remaining text is handed to the innermost open block.
pub fn feedLine(p: *Parser, line: []const u8) Allocator.Error!void {
    // Let each open block, outermost first, consume its part of the line.
    var remaining = line;
    var matched_count: usize = 0;
    for (p.pending_blocks.items) |open_block| {
        remaining = open_block.match(remaining) orelse break;
        matched_count += 1;
    }

    const open_count = p.pending_blocks.items.len;
    const inside_code_block = open_count > 0 and
        p.pending_blocks.getLast().tag == .code_block;
    // Only the code block itself failed to match: this line is its closing
    // fence.
    const closes_code_block = inside_code_block and matched_count + 1 == open_count;
    // New blocks cannot be started if we are actively inside a code block or
    // are just closing one (to avoid interpreting the closing ``` as a new code
    // block start).
    const may_start_block = !inside_code_block or matched_count + 2 <= open_count;
    var next_start: ?BlockStart = if (may_start_block)
        try p.startBlock(remaining)
    else
        null;

    const innermost_is_paragraph = p.pending_blocks.items.len > 0 and
        p.pending_blocks.getLast().tag == .paragraph;
    if (innermost_is_paragraph) {
        if (next_start == null) {
            // This is a lazy continuation line: there are no new blocks to
            // open and the last open block is a paragraph.
            if (!isBlank(remaining)) {
                try p.addScratchStringLine(mem.trimLeft(u8, remaining, " \t"));
                return;
            }
        } else {
            // If a new block needs to be started, any paragraph needs to be
            // closed, even though this isn't detected as part of the closing
            // condition for paragraphs.
            try p.closeLastBlock();
        }
    }

    // Close every block which the line did not continue.
    while (p.pending_blocks.items.len > matched_count) {
        try p.closeLastBlock();
    }

    while (next_start) |start| {
        try p.appendBlockStart(start);
        // There may be more blocks to start within the same line.
        remaining = start.rest;
        switch (start.tag) {
            // Headings may only contain inline content.
            .heading => break,
            // An opening code fence does not contain any additional block or
            // inline content to process.
            .code_block => return,
            else => {},
        }
        next_start = try p.startBlock(remaining);
    }

    // Do not append the end of a code block (```) as textual content.
    if (closes_code_block) return;

    const content_type: Block.ContentType = if (p.pending_blocks.getLastOrNull()) |innermost|
        innermost.canAccept()
    else
        .blocks;
    const trimmed = mem.trimLeft(u8, remaining, " \t");
    switch (content_type) {
        .nothing => {},
        .raw_inlines => try p.addScratchStringLine(remaining),
        .inlines => try p.addScratchStringLine(trimmed),
        .blocks => {
            // If we're inside a list item and the rest of the line is blank, it
            // means that any subsequent child of the list item (or subsequent
            // item in the list) will cause the containing list to be considered
            // loose. However, we can't immediately declare that the list is
            // loose, since we might just be looking at a blank line after the
            // end of the last item in the list. The final determination will be
            // made when appending the next child of the list or list item.
            //
            // The index is computed before a paragraph is possibly pushed
            // below, while the list item is still the innermost block.
            const containing_list_index: ?usize = index: {
                const innermost = p.pending_blocks.getLastOrNull() orelse break :index null;
                break :index if (innermost.tag == .list_item)
                    p.pending_blocks.items.len - 2
                else
                    null;
            };

            const rest_is_blank = trimmed.len == 0;
            if (!rest_is_blank) {
                try p.appendBlockStart(.{
                    .tag = .paragraph,
                    .data = .{ .none = {} },
                    .rest = undefined,
                });
                try p.addScratchStringLine(trimmed);
            }

            if (containing_list_index) |list_index| {
                p.pending_blocks.items[list_index].data.list.last_line_blank = rest_is_blank;
            }
        },
    }
}
|
|
|
|
/// Finishes parsing: closes every block that is still open, attaches the
/// top-level blocks to the root node and returns the parsed document.
///
/// Ownership of the node, extra and string storage moves to the returned
/// `Document`.
pub fn endInput(p: *Parser) Allocator.Error!Document {
    while (p.pending_blocks.items.len != 0) try p.closeLastBlock();
    // There should be no inline content pending after closing the last open
    // block.
    assert(p.scratch_string.items.len == 0);

    // Whatever is left in the scratch list are the children of the root.
    const root_children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items));
    p.nodes.items(.data)[0] = .{ .container = .{ .children = root_children } };
    p.scratch_string.clearRetainingCapacity();
    p.scratch_extra.clearRetainingCapacity();

    var owned_nodes = p.nodes.toOwnedSlice();
    errdefer owned_nodes.deinit(p.allocator);
    const owned_extra = try p.extra.toOwnedSlice(p.allocator);
    errdefer p.allocator.free(owned_extra);
    const owned_strings = try p.string_bytes.toOwnedSlice(p.allocator);

    return .{
        .nodes = owned_nodes,
        .extra = owned_extra,
        .string_bytes = owned_strings,
    };
}
|
|
|
|
/// Describes a block element which a line has been found to start.
const BlockStart = struct {
    /// The kind of block being started; determines which field of `data` is
    /// active.
    tag: Tag,
    /// Per-kind details gathered while recognizing the block start.
    data: Data,
    /// What is left of the line once the block's opening marker has been
    /// removed. `feedLine` scans it for further block starts.
    rest: []const u8,

    const Tag = enum {
        /// Data is `list_item`.
        list_item,
        /// Data is `table_row`.
        table_row,
        /// Data is `heading`.
        heading,
        /// Data is `code_block`.
        code_block,
        /// Data is `none`.
        blockquote,
        /// Data is `none`.
        paragraph,
        /// Data is `none`.
        thematic_break,
    };

    const Data = union {
        none: void,
        list_item: struct {
            marker: Block.Data.ListMarker,
            /// The item's number. Only meaningful for the numbered markers.
            number: u30,
            /// Indentation required of following lines for them to continue
            /// the item.
            continuation_indent: usize,
        },
        table_row: struct {
            /// Raw, not yet inline-parsed content of each cell of the row.
            cells: std.BoundedArray([]const u8, max_table_columns),
        },
        heading: struct {
            /// Between 1 and 6, inclusive.
            level: u3,
        },
        code_block: struct {
            /// The tag written after the opening fence.
            tag: StringIndex,
            /// Number of backticks in the opening fence.
            fence_len: usize,
            /// Indentation of the opening fence.
            indent: usize,
        },
    };
};
|
|
|
|
/// Opens the block described by `block_start` as the new innermost pending
/// block.
///
/// Before doing so, an innermost list or table which cannot contain the new
/// block is closed, list tightness is updated, and a containing list or table
/// is opened implicitly for a list item or table row that has none. A table
/// row which is a header delimiter row does not open a block at all: it only
/// records the column alignments of the table (and marks the header row, if
/// one was already parsed).
fn appendBlockStart(p: *Parser, block_start: BlockStart) !void {
    if (p.pending_blocks.getLastOrNull()) |innermost| {
        const cannot_contain = switch (innermost.tag) {
            // A list only continues with list items of the same marker type.
            .list => block_start.tag != .list_item or
                block_start.data.list_item.marker != innermost.data.list.marker,
            // Table rows are the only allowed children of a table.
            .table => block_start.tag != .table_row,
            else => false,
        };
        if (cannot_contain) try p.closeLastBlock();
    }

    if (p.pending_blocks.getLastOrNull()) |innermost| {
        // If the last block is a list or list item, check for tightness based
        // on the last line.
        const open_blocks = p.pending_blocks.items;
        const containing_list: ?*Block = switch (innermost.tag) {
            .list => &open_blocks[open_blocks.len - 1],
            .list_item => &open_blocks[open_blocks.len - 2],
            else => null,
        };
        if (containing_list) |list| {
            if (list.data.list.last_line_blank) list.data.list.tight = false;
        }
    }

    // Start a new list if the new block is a list item and there is no
    // containing list yet.
    const innermost_is_list = p.pending_blocks.items.len != 0 and
        p.pending_blocks.getLast().tag == .list;
    if (block_start.tag == .list_item and !innermost_is_list) {
        try p.pending_blocks.append(p.allocator, .{
            .tag = .list,
            .data = .{ .list = .{
                .marker = block_start.data.list_item.marker,
                .start = block_start.data.list_item.number,
                .tight = true,
            } },
            .string_start = p.scratch_string.items.len,
            .extra_start = p.scratch_extra.items.len,
        });
    }

    if (block_start.tag == .table_row) {
        // Likewise, table rows start a table implicitly.
        const innermost_is_table = p.pending_blocks.items.len != 0 and
            p.pending_blocks.getLast().tag == .table;
        if (!innermost_is_table) {
            try p.pending_blocks.append(p.allocator, .{
                .tag = .table,
                .data = .{ .table = .{
                    .column_alignments = .{},
                } },
                .string_start = p.scratch_string.items.len,
                .extra_start = p.scratch_extra.items.len,
            });
        }

        // Number of rows the table has received so far. A header delimiter
        // row is only recognized as the first or second line of the table.
        const rows_so_far = p.scratch_extra.items.len - p.pending_blocks.getLast().extra_start;
        if (rows_so_far <= 1) {
            if (parseTableHeaderDelimiter(block_start.data.table_row.cells)) |alignments| {
                const table = &p.pending_blocks.items[p.pending_blocks.items.len - 1];
                table.data.table.column_alignments = alignments;
                if (rows_so_far == 1) {
                    // We need to go back and mark the header row and its column
                    // alignments.
                    const node_datas = p.nodes.items(.data);
                    const header_row = node_datas[p.scratch_extra.getLast()];
                    for (p.extraChildren(header_row.container.children), 0..) |cell_index, column| {
                        const cell = &node_datas[@intFromEnum(cell_index)].table_cell;
                        cell.info.alignment = if (column < alignments.len)
                            alignments.buffer[column]
                        else
                            .unset;
                        cell.info.header = true;
                    }
                }
                // The delimiter row itself produces no node.
                return;
            }
        }
    }

    const block_tag: Block.Tag, const block_data: Block.Data = switch (block_start.tag) {
        .list_item => .{ .list_item, .{ .list_item = .{
            .continuation_indent = block_start.data.list_item.continuation_indent,
        } } },
        .table_row => .{ .table_row, .{ .none = {} } },
        .heading => .{ .heading, .{ .heading = .{
            .level = block_start.data.heading.level,
        } } },
        .code_block => .{ .code_block, .{ .code_block = .{
            .tag = block_start.data.code_block.tag,
            .fence_len = block_start.data.code_block.fence_len,
            .indent = block_start.data.code_block.indent,
        } } },
        .blockquote => .{ .blockquote, .{ .none = {} } },
        .paragraph => .{ .paragraph, .{ .none = {} } },
        .thematic_break => .{ .thematic_break, .{ .none = {} } },
    };

    try p.pending_blocks.append(p.allocator, .{
        .tag = block_tag,
        .data = block_data,
        .string_start = p.scratch_string.items.len,
        .extra_start = p.scratch_extra.items.len,
    });

    if (block_tag == .table_row) {
        // Table rows are unique, since we already have all the children
        // available in the BlockStart. We can immediately parse and append
        // these children now.
        const table = p.pending_blocks.items[p.pending_blocks.items.len - 2];
        const alignments = table.data.table.column_alignments.slice();
        for (block_start.data.table_row.cells.slice(), 0..) |cell_text, column| {
            const cell_children = try p.parseInlines(cell_text);
            const cell_node = try p.addNode(.{
                .tag = .table_cell,
                .data = .{ .table_cell = .{
                    .info = .{
                        // Columns beyond those of the delimiter row have no
                        // alignment.
                        .alignment = if (column < alignments.len) alignments[column] else .unset,
                        .header = false,
                    },
                    .children = cell_children,
                } },
            });
            try p.addScratchExtraNode(cell_node);
        }
    }
}
|
|
|
|
/// Determines whether `line` starts a new block and, if so, describes it.
/// Returns null when the line starts none of the recognized blocks.
///
/// The candidates are tried in a fixed order: thematic break, list item,
/// table row, heading, code block, blockquote. Recognizing a code block
/// stores its tag string in the parser, hence the error union.
fn startBlock(p: *Parser, line: []const u8) !?BlockStart {
    const unindented = mem.trimLeft(u8, line, " \t");
    const indent = line.len - unindented.len;

    // Thematic breaks take precedence over list items.
    if (isThematicBreak(line)) return .{
        .tag = .thematic_break,
        .data = .{ .none = {} },
        .rest = "",
    };

    if (startListItem(unindented)) |item| return .{
        .tag = .list_item,
        .data = .{ .list_item = .{
            .marker = item.marker,
            .number = item.number,
            // Continuation lines must be indented past the marker.
            .continuation_indent = indent + item.marker_len,
        } },
        .rest = item.rest,
    };

    if (startTableRow(unindented)) |row| return .{
        .tag = .table_row,
        .data = .{ .table_row = .{
            .cells = row.cells,
        } },
        .rest = "",
    };

    if (startHeading(unindented)) |heading| return .{
        .tag = .heading,
        .data = .{ .heading = .{
            .level = heading.level,
        } },
        .rest = heading.rest,
    };

    if (try p.startCodeBlock(unindented)) |fence| return .{
        .tag = .code_block,
        .data = .{ .code_block = .{
            .tag = fence.tag,
            .fence_len = fence.fence_len,
            .indent = indent,
        } },
        .rest = "",
    };

    if (startBlockquote(unindented)) |quoted| return .{
        .tag = .blockquote,
        .data = .{ .none = {} },
        .rest = quoted,
    };

    return null;
}
|
|
|
|
/// Result of recognizing a list item marker at the start of a line.
const ListItemStart = struct {
    /// The kind of marker found.
    marker: Block.Data.ListMarker,
    /// The item's number. Left undefined for the bullet markers.
    number: u30,
    /// Length in bytes of the marker, including the space following it.
    marker_len: usize,
    /// The line content following the marker and its space.
    rest: []const u8,
};
|
|
|
|
/// Recognizes a list item marker at the start of `unindented_line`: one of
/// `-`, `*` or `+`, or a decimal number of at most 999,999,999 followed by
/// `.` or `)`. In every case the marker must be followed by a space.
/// Returns null if the line does not begin with such a marker.
fn startListItem(unindented_line: []const u8) ?ListItemStart {
    // Bullet markers: a single marker character followed by a space.
    if (unindented_line.len >= 2 and unindented_line[1] == ' ') {
        const bullet: ?Block.Data.ListMarker = switch (unindented_line[0]) {
            '-' => .@"-",
            '*' => .@"*",
            '+' => .@"+",
            else => null,
        };
        if (bullet) |marker| return .{
            .marker = marker,
            .number = undefined,
            .marker_len = 2,
            .rest = unindented_line[2..],
        };
    }

    // Numbered markers: digits, then `.` or `)`, then a space. A line made up
    // only of digits has no delimiter and is not a list item.
    const digits_len = mem.indexOfNone(u8, unindented_line, "0123456789") orelse return null;
    const after_digits = unindented_line[digits_len..];
    if (after_digits.len < 2 or after_digits[1] != ' ') return null;
    const marker: Block.Data.ListMarker = switch (after_digits[0]) {
        '.' => .number_dot,
        ')' => .number_paren,
        else => return null,
    };
    // Parsing also fails (yielding null) when there are no digits at all or
    // when the number does not fit in a `u30`.
    const number = std.fmt.parseInt(u30, unindented_line[0..digits_len], 10) catch return null;
    if (number > 999_999_999) return null;

    return .{
        .marker = marker,
        .number = number,
        .marker_len = digits_len + 2,
        .rest = after_digits[2..],
    };
}
|
|
|
|
/// Result of recognizing a table row.
const TableRowStart = struct {
    /// Raw content of each cell, in order, without the delimiting pipes.
    cells: std.BoundedArray([]const u8, max_table_columns),
};
|
|
|
|
/// Recognizes a table row: a line which begins and ends with an unescaped
/// `|`. On success, returns the raw content of each cell.
///
/// Cells are separated by `|`. A pipe which is escaped with a backslash or
/// which lies inside a code span does not separate cells. A code span is
/// opened by a run of backticks and closed by the next run of exactly the
/// same length.
///
/// Returns null if the line is not a table row, if it contains a code span
/// which is never closed, or if it has more than `max_table_columns` cells.
fn startTableRow(unindented_line: []const u8) ?TableRowStart {
    if (unindented_line.len < 2 or
        !mem.startsWith(u8, unindented_line, "|") or
        mem.endsWith(u8, unindented_line, "\\|") or
        !mem.endsWith(u8, unindented_line, "|")) return null;

    var cells: std.BoundedArray([]const u8, max_table_columns) = .{};
    // Everything between the outer pipes.
    const table_row_content = unindented_line[1 .. unindented_line.len - 1];
    var cell_start: usize = 0;
    var i: usize = 0;
    while (i < table_row_content.len) : (i += 1) {
        switch (table_row_content[i]) {
            // Skip the escaped character, whatever it is.
            '\\' => i += 1,
            '|' => {
                cells.append(table_row_content[cell_start..i]) catch return null;
                cell_start = i + 1;
            },
            '`' => {
                // Ignoring pipes in code spans allows table cells to contain
                // code using ||, for example.
                const open_end = mem.indexOfNonePos(u8, table_row_content, i, "`") orelse return null;
                const open_len = open_end - i;
                var search_from = open_end;
                // Leave `i` on the LAST backtick of the closing run, so that
                // the loop increment moves on to the character right after
                // the code span instead of skipping over it (it may well be
                // the `|` which ends the cell).
                i = while (mem.indexOfScalarPos(u8, table_row_content, search_from, '`')) |close_start| {
                    // A closing run may extend to the very end of the row
                    // content, i.e. sit directly before the final `|`.
                    const close_end = mem.indexOfNonePos(u8, table_row_content, close_start, "`") orelse
                        table_row_content.len;
                    if (close_end - close_start == open_len) break close_end - 1;
                    search_from = close_end;
                } else return null;
            },
            else => {},
        }
    }
    // The last cell has no trailing pipe inside the row content.
    cells.append(table_row_content[cell_start..]) catch return null;

    return .{ .cells = cells };
}
|
|
|
|
/// Interprets `row_cells` as a table header delimiter row, returning the
/// alignment requested by each cell. Returns null if any of the cells is not
/// a valid delimiter cell.
fn parseTableHeaderDelimiter(
    row_cells: std.BoundedArray([]const u8, max_table_columns),
) ?std.BoundedArray(Node.TableCellAlignment, max_table_columns) {
    var alignments: std.BoundedArray(Node.TableCellAlignment, max_table_columns) = .{};
    for (row_cells.slice()) |cell| {
        if (parseTableHeaderDelimiterCell(cell)) |alignment| {
            // Both arrays have the same capacity, so this cannot overflow.
            alignments.appendAssumeCapacity(alignment);
        } else {
            return null;
        }
    }
    return alignments;
}
|
|
|
|
/// Parses a single cell of a table header delimiter row.
///
/// A valid cell consists of optional spaces, an optional `:` (left anchor),
/// one or more `-`, an optional `:` (right anchor) and optional spaces, with
/// nothing else in between. Returns the alignment selected by the anchors,
/// or null if the cell does not have this form.
fn parseTableHeaderDelimiterCell(content: []const u8) ?Node.TableCellAlignment {
    // Only spaces may surround the rule.
    var rule = mem.trim(u8, content, " ");

    const left_anchor = mem.startsWith(u8, rule, ":");
    if (left_anchor) rule = rule[1..];
    const right_anchor = mem.endsWith(u8, rule, ":");
    if (right_anchor) rule = rule[0 .. rule.len - 1];

    // What remains between the anchors must be a non-empty run of dashes.
    if (rule.len == 0 or mem.indexOfNone(u8, rule, "-") != null) return null;

    if (left_anchor) return if (right_anchor) .center else .left;
    return if (right_anchor) .right else .unset;
}
|
|
|
|
test parseTableHeaderDelimiterCell {
    const Case = struct {
        input: []const u8,
        expected: ?Node.TableCellAlignment,
    };
    const cases = [_]Case{
        // No rule at all.
        .{ .input = "", .expected = null },
        .{ .input = " ", .expected = null },
        // Plain rules.
        .{ .input = "-", .expected = .unset },
        .{ .input = " - ", .expected = .unset },
        .{ .input = "----", .expected = .unset },
        .{ .input = " ---- ", .expected = .unset },
        // Anchors without a rule.
        .{ .input = ":", .expected = null },
        .{ .input = "::", .expected = null },
        // Left anchor.
        .{ .input = ":-", .expected = .left },
        .{ .input = " :----", .expected = .left },
        // Both anchors.
        .{ .input = ":-:", .expected = .center },
        .{ .input = ":----:", .expected = .center },
        .{ .input = " :----: ", .expected = .center },
        // Right anchor.
        .{ .input = "-:", .expected = .right },
        .{ .input = "----:", .expected = .right },
        .{ .input = " ----: ", .expected = .right },
    };
    for (cases) |case| {
        try expectEqual(case.expected, parseTableHeaderDelimiterCell(case.input));
    }
}
|
|
|
|
/// Result of recognizing the start of a heading.
const HeadingStart = struct {
    /// Number of `#` characters opening the heading, between 1 and 6.
    level: u3,
    /// The line content after the space which follows the `#` characters.
    rest: []const u8,
};

/// Recognizes a heading: between one and six `#` characters followed by a
/// space. Returns null for any other line.
///
/// `unindented_line` must not begin with a space (asserted).
fn startHeading(unindented_line: []const u8) ?HeadingStart {
    // A line consisting only of `#` characters (or an empty line) has no
    // space to end the marker and is not a heading.
    const hash_count = mem.indexOfNone(u8, unindented_line, "#") orelse return null;
    if (hash_count > 6) return null;
    if (unindented_line[hash_count] != ' ') return null;
    // We must have seen at least one # by this point, since unindented_line
    // has no leading spaces.
    assert(hash_count > 0);
    return .{
        .level = @intCast(hash_count),
        .rest = unindented_line[hash_count + 1 ..],
    };
}
|
|
|
|
/// Result of recognizing the opening fence of a code block.
const CodeBlockStart = struct {
    /// The text following the fence, with surrounding spaces removed.
    tag: StringIndex,
    /// Number of backticks making up the fence.
    fence_len: usize,
};
|
|
|
|
/// Recognizes the opening fence of a code block: at least three backticks,
/// optionally followed by a tag. On success the tag (trimmed of spaces) is
/// stored with `addString`; otherwise null is returned and nothing is stored.
fn startCodeBlock(p: *Parser, unindented_line: []const u8) !?CodeBlockStart {
    // The fence is the leading run of backticks; it may span the whole line.
    const fence_len = mem.indexOfNone(u8, unindented_line, "`") orelse unindented_line.len;
    if (fence_len < 3) return null;

    // Code block tags may not contain backticks, since that would create
    // potential confusion with inline code spans.
    const tag_bytes = unindented_line[fence_len..];
    if (mem.indexOfScalar(u8, tag_bytes, '`') != null) return null;

    return .{
        .tag = try p.addString(mem.trim(u8, tag_bytes, " ")),
        .fence_len = fence_len,
    };
}
|
|
|
|
/// Recognizes the start of a blockquote. If `unindented_line` begins with
/// `>`, returns everything after that character; otherwise returns null.
fn startBlockquote(unindented_line: []const u8) ?[]const u8 {
    if (!mem.startsWith(u8, unindented_line, ">")) return null;
    return unindented_line[1..];
}
|
|
|
|
/// Reports whether `line` is a thematic break: at least three occurrences of
/// one and the same character out of `-`, `_` and `*`, with nothing but
/// spaces before, between or after them.
fn isThematicBreak(line: []const u8) bool {
    var marker: ?u8 = null;
    var marker_count: usize = 0;
    for (line) |byte| {
        if (byte == ' ') continue;
        if (byte != '-' and byte != '_' and byte != '*') return false;
        if (marker) |expected| {
            // All marker characters must be the same.
            if (byte != expected) return false;
        } else {
            marker = byte;
        }
        marker_count += 1;
    }
    return marker_count >= 3;
}
|
|
|
|
/// Closes the innermost pending block: pops it from the stack, turns what it
/// has collected (child nodes in `scratch_extra`, text in `scratch_string`)
/// into a node, releases that scratch space and records the new node as a
/// child of whichever block is now innermost.
///
/// There must be at least one pending block.
fn closeLastBlock(p: *Parser) !void {
    const closed = p.pending_blocks.pop().?;
    const new_node = switch (closed.tag) {
        .list => list: {
            assert(closed.string_start == p.scratch_string.items.len);

            // Although tightness is parsed as a property of the list, it is
            // stored at the list item level to make it possible to render each
            // node without any context from its parents.
            const item_nodes = p.scratch_extra.items[closed.extra_start..];
            if (!closed.data.list.tight) {
                const node_datas = p.nodes.items(.data);
                for (item_nodes) |item_node| {
                    node_datas[item_node].list_item.tight = false;
                }
            }

            const children = try p.addExtraChildren(@ptrCast(item_nodes));
            break :list try p.addNode(.{
                .tag = .list,
                .data = .{ .list = .{
                    .start = switch (closed.data.list.marker) {
                        .number_dot, .number_paren => @enumFromInt(closed.data.list.start),
                        .@"-", .@"*", .@"+" => .unordered,
                    },
                    .children = children,
                } },
            });
        },
        .list_item => item: {
            assert(closed.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[closed.extra_start..]));
            break :item try p.addNode(.{
                .tag = .list_item,
                .data = .{ .list_item = .{
                    // Items start out tight; closing a loose list clears this.
                    .tight = true,
                    .children = children,
                } },
            });
        },
        .table => table: {
            assert(closed.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[closed.extra_start..]));
            break :table try p.addNode(.{
                .tag = .table,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .table_row => row: {
            assert(closed.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[closed.extra_start..]));
            break :row try p.addNode(.{
                .tag = .table_row,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .heading => heading: {
            // The collected text is parsed as inline content.
            const children = try p.parseInlines(p.scratch_string.items[closed.string_start..]);
            break :heading try p.addNode(.{
                .tag = .heading,
                .data = .{ .heading = .{
                    .level = closed.data.heading.level,
                    .children = children,
                } },
            });
        },
        .code_block => code: {
            // The collected text is stored verbatim.
            const content = try p.addString(p.scratch_string.items[closed.string_start..]);
            break :code try p.addNode(.{
                .tag = .code_block,
                .data = .{ .code_block = .{
                    .tag = closed.data.code_block.tag,
                    .content = content,
                } },
            });
        },
        .blockquote => quote: {
            assert(closed.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[closed.extra_start..]));
            break :quote try p.addNode(.{
                .tag = .blockquote,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .paragraph => paragraph: {
            // The collected text is parsed as inline content.
            const children = try p.parseInlines(p.scratch_string.items[closed.string_start..]);
            break :paragraph try p.addNode(.{
                .tag = .paragraph,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .thematic_break => try p.addNode(.{
            .tag = .thematic_break,
            .data = .{ .none = {} },
        }),
    };

    // Release the scratch space used by the closed block, then register the
    // finished node as pending child data of the enclosing block.
    p.scratch_string.items.len = closed.string_start;
    p.scratch_extra.items.len = closed.extra_start;
    try p.addScratchExtraNode(new_node);
}
|
|
|
|
const InlineParser = struct {
|
|
parent: *Parser,
|
|
content: []const u8,
|
|
pos: usize = 0,
|
|
pending_inlines: std.ArrayListUnmanaged(PendingInline) = .empty,
|
|
completed_inlines: std.ArrayListUnmanaged(CompletedInline) = .empty,
|
|
|
|
const PendingInline = struct {
|
|
tag: Tag,
|
|
data: Data,
|
|
start: usize,
|
|
|
|
const Tag = enum {
|
|
/// Data is `emphasis`.
|
|
emphasis,
|
|
/// Data is `none`.
|
|
link,
|
|
/// Data is `none`.
|
|
image,
|
|
};
|
|
|
|
const Data = union {
|
|
none: void,
|
|
emphasis: struct {
|
|
underscore: bool,
|
|
run_len: usize,
|
|
},
|
|
};
|
|
};
|
|
|
|
const CompletedInline = struct {
|
|
node: Node.Index,
|
|
start: usize,
|
|
len: usize,
|
|
};
|
|
|
|
fn deinit(ip: *InlineParser) void {
|
|
ip.pending_inlines.deinit(ip.parent.allocator);
|
|
ip.completed_inlines.deinit(ip.parent.allocator);
|
|
}
|
|
|
|
/// Parses all of `ip.content`, returning the children of the node
|
|
/// containing the inline content.
|
|
fn parse(ip: *InlineParser) Allocator.Error!ExtraIndex {
|
|
while (ip.pos < ip.content.len) : (ip.pos += 1) {
|
|
switch (ip.content[ip.pos]) {
|
|
'\\' => ip.pos += 1,
|
|
'[' => try ip.pending_inlines.append(ip.parent.allocator, .{
|
|
.tag = .link,
|
|
.data = .{ .none = {} },
|
|
.start = ip.pos,
|
|
}),
|
|
'!' => if (ip.pos + 1 < ip.content.len and ip.content[ip.pos + 1] == '[') {
|
|
try ip.pending_inlines.append(ip.parent.allocator, .{
|
|
.tag = .image,
|
|
.data = .{ .none = {} },
|
|
.start = ip.pos,
|
|
});
|
|
ip.pos += 1;
|
|
},
|
|
']' => try ip.parseLink(),
|
|
'<' => try ip.parseAutolink(),
|
|
'*', '_' => try ip.parseEmphasis(),
|
|
'`' => try ip.parseCodeSpan(),
|
|
'h' => if (ip.pos == 0 or isPreTextAutolink(ip.content[ip.pos - 1])) {
|
|
try ip.parseTextAutolink();
|
|
},
|
|
else => {},
|
|
}
|
|
}
|
|
|
|
const children = try ip.encodeChildren(0, ip.content.len);
|
|
// There may be pending inlines after parsing (e.g. unclosed emphasis
|
|
// runs), but there must not be any completed inlines, since those
|
|
// should all be part of `children`.
|
|
assert(ip.completed_inlines.items.len == 0);
|
|
return children;
|
|
}
|
|
|
|
/// Tries to complete a link or image whose text ends at the `]` under
/// `ip.pos`. When a `(target)` follows and is properly terminated, `ip.pos`
/// ends on its closing `)`; otherwise `ip.pos` stays on the `]`.
fn parseLink(ip: *InlineParser) !void {
    // Walk down the pending stack to the nearest link or image opener.
    var index = ip.pending_inlines.items.len;
    const opener = while (index > 0) {
        index -= 1;
        const candidate = ip.pending_inlines.items[index];
        if (candidate.tag == .link or candidate.tag == .image) break candidate;
    } else return;
    // The opener and everything pushed after it are discarded, whether or
    // not a valid target follows.
    ip.pending_inlines.shrinkRetainingCapacity(index);

    // Link text begins after `[` (links) or `![` (images).
    const opener_len: usize = switch (opener.tag) {
        .link => 1,
        .image => 2,
        else => unreachable,
    };
    const text_start = opener.start + opener_len;
    const text_end = ip.pos;

    const text = ip.content;
    const has_paren = text_end + 1 < text.len and text[text_end + 1] == '(';
    if (!has_paren) return;

    // Find the `)` that balances the opening `(`, honoring backslash escapes
    // and nested parentheses. An unterminated target means no link.
    const target_start = text_end + 2;
    var depth: usize = 1;
    var cursor = target_start;
    const target_end = while (cursor < text.len) : (cursor += 1) {
        switch (text[cursor]) {
            '\\' => cursor += 1,
            '(' => depth += 1,
            ')' => {
                if (depth == 1) break cursor;
                depth -= 1;
            },
            else => {},
        }
    } else return;
    ip.pos = target_end;

    const children = try ip.encodeChildren(text_start, text_end);
    const target = try ip.encodeLinkTarget(target_start, target_end);

    const node = try ip.parent.addNode(.{
        .tag = switch (opener.tag) {
            .link => .link,
            .image => .image,
            else => unreachable,
        },
        .data = .{ .link = .{
            .target = target,
            .children = children,
        } },
    });
    // The completed inline spans from the opener through the closing `)`.
    try ip.completed_inlines.append(ip.parent.allocator, .{
        .node = node,
        .start = opener.start,
        .len = target_end - opener.start + 1,
    });
}
|
|
|
|
/// Appends the unescaped form of `ip.content[start..end]` to the parent's
/// `string_bytes` as a NUL-terminated string and returns its index. Line
/// breaks reported by `TextIterator` are written back out as a backslash
/// followed by a newline.
fn encodeLinkTarget(ip: *InlineParser, start: usize, end: usize) !StringIndex {
    const gpa = ip.parent.allocator;
    const bytes = &ip.parent.string_bytes;

    // The target is written straight into `string_bytes`; no temporary
    // buffer is needed because the pieces arrive in order. On failure,
    // whatever was written is rolled back.
    const target_index = bytes.items.len;
    errdefer bytes.shrinkRetainingCapacity(target_index);

    var pieces: TextIterator = .{ .content = ip.content[start..end] };
    while (pieces.next()) |piece| {
        switch (piece) {
            .char => |c| try bytes.append(gpa, c),
            .text => |s| try bytes.appendSlice(gpa, s),
            .line_break => try bytes.appendSlice(gpa, "\\\n"),
        }
    }
    try bytes.append(gpa, 0);
    return @enumFromInt(target_index);
}
|
|
|
|
/// Tries to parse a `<scheme:target>` autolink whose opening `<` is at
/// `ip.pos`. On success `ip.pos` ends on the closing `>`; on failure it is
/// restored to the opening `<`.
fn parseAutolink(ip: *InlineParser) !void {
    const open_pos = ip.pos;
    const Phase = enum { scheme_first, scheme_rest, target };
    var phase: Phase = .scheme_first;

    ip.pos = open_pos + 1;
    while (ip.pos < ip.content.len) : (ip.pos += 1) {
        const c = ip.content[ip.pos];
        switch (phase) {
            // The scheme must begin with an ASCII letter.
            .scheme_first => switch (c) {
                'A'...'Z', 'a'...'z' => phase = .scheme_rest,
                else => break,
            },
            // The rest of the scheme, terminated by `:`.
            .scheme_rest => switch (c) {
                ':' => phase = .target,
                'A'...'Z', 'a'...'z', '0'...'9', '+', '.', '-' => {},
                else => break,
            },
            .target => switch (c) {
                '>' => {
                    const close_pos = ip.pos;
                    // The target is stored verbatim: backslash escapes are
                    // not interpreted inside autolinks.
                    const target = try ip.parent.addString(ip.content[open_pos + 1 .. close_pos]);
                    const node = try ip.parent.addNode(.{
                        .tag = .autolink,
                        .data = .{ .text = .{
                            .content = target,
                        } },
                    });
                    try ip.completed_inlines.append(ip.parent.allocator, .{
                        .node = node,
                        .start = open_pos,
                        .len = close_pos - open_pos + 1,
                    });
                    return;
                },
                // These characters may not appear in an autolink.
                '<', ' ', '\t', '\n' => break,
                else => {},
            },
        }
    }
    // No valid autolink was found: rewind to the `<`.
    ip.pos = open_pos;
}
|
|
|
|
/// Tries to parse a bare `http://` or `https://` autolink (one not wrapped
/// in `<>`) beginning at `ip.pos`. On success `ip.pos` ends on the last
/// character of the link; if no link is recognized, `ip.pos` is left where
/// it started.
fn parseTextAutolink(ip: *InlineParser) !void {
    const text = ip.content;
    const link_start = ip.pos;

    // Only these two prefixes introduce a text autolink.
    const tail = text[link_start..];
    const prefix_len: usize = if (mem.startsWith(u8, tail, "https://"))
        "https://".len
    else if (mem.startsWith(u8, tail, "http://"))
        "http://".len
    else
        return;
    const body_start = link_start + prefix_len;

    // The link body runs until whitespace or an unbalanced `)`.
    var link_end = body_start;
    var open_parens: usize = 0;
    while (link_end < text.len) : (link_end += 1) {
        switch (text[link_end]) {
            ' ', '\t', '\n' => break,
            '(' => open_parens += 1,
            ')' => {
                if (open_parens == 0) break;
                open_parens -= 1;
            },
            else => {},
        }
    }

    // Trailing punctuation is treated as following the link rather than
    // belonging to it.
    while (link_end > body_start and isPostTextAutolink(text[link_end - 1])) {
        link_end -= 1;
    }
    // Nothing left after the prefix: not a link.
    if (link_end == body_start) return;

    const target = try ip.parent.addString(text[link_start..link_end]);
    const node = try ip.parent.addNode(.{
        .tag = .autolink,
        .data = .{ .text = .{
            .content = target,
        } },
    });
    try ip.completed_inlines.append(ip.parent.allocator, .{
        .node = node,
        .start = link_start,
        .len = link_end - link_start,
    });
    // Leave the position on the link's final character.
    ip.pos = link_end - 1;
}
|
|
|
|
/// Reports whether a text autolink may be recognized immediately after the
/// character `c`.
fn isPreTextAutolink(c: u8) bool {
    return mem.indexOfScalar(u8, " \t\n*_(", c) != null;
}
|
|
|
|
/// Reports whether `c` is punctuation which, when it trails a text autolink,
/// is excluded from the link.
fn isPostTextAutolink(c: u8) bool {
    return mem.indexOfScalar(u8, "?!.,:*_", c) != null;
}
|
|
|
|
/// Handles a run of `*` or `_` characters beginning at `ip.pos`, closing any
/// matching pending openers and/or registering the run as a new opener.
/// `ip.pos` ends on the last character of the run.
fn parseEmphasis(ip: *InlineParser) !void {
    const text = ip.content;
    const marker = text[ip.pos];
    const is_underscore = marker == '_';

    // Measure the full run of identical marker characters.
    var run_start = ip.pos;
    var run_end = run_start + 1;
    while (run_end < text.len and text[run_end] == marker) run_end += 1;
    ip.pos = run_end - 1;
    var run_len = run_end - run_start;

    // The edges of the content count as both whitespace and punctuation.
    const at_begin = run_start == 0;
    const at_end = run_end == text.len;
    const space_before = at_begin or isWhitespace(text[run_start - 1]);
    const space_after = at_end or isWhitespace(text[run_end]);
    const punct_before = at_begin or isPunctuation(text[run_start - 1]);
    const punct_after = at_end or isPunctuation(text[run_end]);

    // Underscores are held to stricter rules than asterisks so that
    // snake_case words are not mistaken for emphasis.
    const can_open = !space_after and
        (!is_underscore or space_before or punct_before);
    const can_close = !space_before and
        (!is_underscore or space_after or punct_after);

    if (can_close) {
        var i = ip.pending_inlines.items.len;
        while (i > 0 and run_len > 0) {
            i -= 1;
            const opener = &ip.pending_inlines.items[i];
            const matches = opener.tag == .emphasis and
                opener.data.emphasis.underscore == is_underscore;
            if (!matches) continue;

            // Pair up as many markers as both runs can supply. The markers
            // used from the opener are the ones nearest to the content.
            const used = @min(opener.data.emphasis.run_len, run_len);
            const content_start = opener.start + opener.data.emphasis.run_len;
            const span_start = content_start - used;
            const span_len = run_start - span_start + used;

            const node = try ip.encodeEmphasis(content_start, run_start, used);
            try ip.completed_inlines.append(ip.parent.allocator, .{
                .node = node,
                .start = span_start,
                .len = span_len,
            });

            // Whatever is left of this run may close openers further down
            // the stack, or may itself become an opener below.
            run_start += used;
            run_len -= used;

            // Pending inlines above this opener can no longer be closed and
            // are dropped. The opener itself is dropped too once all of its
            // markers have been used.
            opener.data.emphasis.run_len -= used;
            const keep = if (opener.data.emphasis.run_len == 0) i else i + 1;
            ip.pending_inlines.shrinkRetainingCapacity(keep);
        }
    }

    if (can_open and run_len > 0) {
        try ip.pending_inlines.append(ip.parent.allocator, .{
            .tag = .emphasis,
            .data = .{ .emphasis = .{
                .underscore = is_underscore,
                .run_len = run_len,
            } },
            .start = run_start,
        });
    }
}
|
|
|
|
/// Builds the node(s) for emphasis delimited by `run_len` marker characters
/// around the content range `start..end`, returning the outermost node.
/// `run_len % 3` selects the innermost shape (1: emphasis, 2: strong,
/// 0: emphasis wrapping strong); each further group of three markers adds
/// another emphasis-wrapping-strong layer around it.
fn encodeEmphasis(ip: *InlineParser, start: usize, end: usize, run_len: usize) !Node.Index {
    const p = ip.parent;
    const children = try ip.encodeChildren(start, end);

    const remainder = run_len % 3;
    var outermost: Node.Index = if (remainder == 1)
        try p.addNode(.{
            .tag = .emphasis,
            .data = .{ .container = .{
                .children = children,
            } },
        })
    else if (remainder == 2)
        try p.addNode(.{
            .tag = .strong,
            .data = .{ .container = .{
                .children = children,
            } },
        })
    else both: {
        const strong = try p.addNode(.{
            .tag = .strong,
            .data = .{ .container = .{
                .children = children,
            } },
        });
        const strong_only = try p.addExtraChildren(&.{strong});
        break :both try p.addNode(.{
            .tag = .emphasis,
            .data = .{ .container = .{
                .children = strong_only,
            } },
        });
    };

    // Wrap once for every additional group of three markers.
    var remaining = run_len;
    while (remaining > 3) {
        const inner_only = try p.addExtraChildren(&.{outermost});
        const strong = try p.addNode(.{
            .tag = .strong,
            .data = .{ .container = .{
                .children = inner_only,
            } },
        });
        const strong_only = try p.addExtraChildren(&.{strong});
        outermost = try p.addNode(.{
            .tag = .emphasis,
            .data = .{ .container = .{
                .children = strong_only,
            } },
        });
        remaining -= 3;
    }

    return outermost;
}
|
|
|
|
/// Parses a code span whose opening backtick run begins at `ip.pos`. The
/// span is closed by the next backtick run of the same length; if there is
/// none, the span extends to the end of the content. `ip.pos` ends on the
/// last character of the span.
fn parseCodeSpan(ip: *InlineParser) !void {
    const text = ip.content;
    const opener_start = ip.pos;
    const body_start = mem.indexOfNonePos(u8, text, opener_start, "`") orelse text.len;
    const opener_len = body_start - opener_start;

    // Unless a matching closer is found, both the body and the span run to
    // the end of the content.
    var body_end = text.len;
    var span_end = text.len;
    var search_from = body_start;
    while (mem.indexOfScalarPos(u8, text, search_from, '`')) |closer_start| {
        const closer_end = mem.indexOfNonePos(u8, text, closer_start, "`") orelse text.len;
        if (closer_end - closer_start == opener_len) {
            body_end = closer_start;
            span_end = closer_end;
            break;
        }
        // A run of the wrong length is ordinary content; keep looking.
        search_from = closer_end;
    }
    ip.pos = span_end;

    var body: []const u8 = if (body_start < text.len)
        text[body_start..body_end]
    else
        "";
    // Dropping a single space next to a leading or trailing backtick makes
    // it possible to write code spans that begin or end with a backtick.
    if (mem.startsWith(u8, body, " `")) body = body[1..];
    if (mem.endsWith(u8, body, "` ")) body = body[0 .. body.len - 1];

    const string = try ip.parent.addString(body);
    const node = try ip.parent.addNode(.{
        .tag = .code_span,
        .data = .{ .text = .{
            .content = string,
        } },
    });
    try ip.completed_inlines.append(ip.parent.allocator, .{
        .node = node,
        .start = opener_start,
        .len = span_end - opener_start,
    });
    // Step back so that the position rests on the span's final character
    // rather than just past it.
    ip.pos -= 1;
}
|
|
|
|
/// Encodes the children found in the content range `start..end`: the
/// completed inlines that begin at or after `start`, interleaved with text
/// nodes for the content between them. The consumed inlines are removed
/// from `completed_inlines`.
fn encodeChildren(ip: *InlineParser, start: usize, end: usize) !ExtraIndex {
    const scratch = &ip.parent.scratch_extra;
    const scratch_base = scratch.items.len;
    // Scratch space is only borrowed for the duration of this call.
    defer scratch.shrinkRetainingCapacity(scratch_base);

    // Locate the first completed inline that starts inside the range.
    const completed = &ip.completed_inlines;
    var first = completed.items.len;
    while (first > 0 and completed.items[first - 1].start >= start) first -= 1;

    var text_pos = start;
    for (completed.items[first..]) |child| {
        // Each completed inline must lie wholly within the range and must
        // not overlap the previous one.
        assert(child.start >= text_pos and child.start + child.len <= end);

        // Emit any plain text preceding this inline, then the inline.
        if (child.start > text_pos) try ip.encodeTextNode(text_pos, child.start);
        try ip.parent.addScratchExtraNode(child.node);

        text_pos = child.start + child.len;
    }
    completed.shrinkRetainingCapacity(first);

    // Emit any plain text following the last inline.
    if (text_pos < end) try ip.encodeTextNode(text_pos, end);

    const encoded = scratch.items[scratch_base..];
    return try ip.parent.addExtraChildren(@ptrCast(encoded));
}
|
|
|
|
/// Encodes the text `ip.content[start..end]` into `scratch_extra` as a
/// sequence of `text` nodes separated by `line_break` nodes wherever the
/// text iterator reports a line break.
fn encodeTextNode(ip: *InlineParser, start: usize, end: usize) !void {
    const p = ip.parent;
    const gpa = p.allocator;

    // Text is written straight into `string_bytes`; no temporary buffer is
    // needed because the pieces arrive in order. On failure, whatever was
    // written is rolled back.
    const rollback_len = p.string_bytes.items.len;
    errdefer p.string_bytes.shrinkRetainingCapacity(rollback_len);

    // Start of the text accumulated since the last emitted node.
    var run_start = rollback_len;
    var pieces: TextIterator = .{ .content = ip.content[start..end] };
    while (pieces.next()) |piece| {
        switch (piece) {
            .char => |c| try p.string_bytes.append(gpa, c),
            .text => |s| try p.string_bytes.appendSlice(gpa, s),
            .line_break => {
                // Flush any accumulated text before the break.
                const has_text = p.string_bytes.items.len > run_start;
                if (has_text) {
                    try p.string_bytes.append(gpa, 0);
                    const text_node = try p.addNode(.{
                        .tag = .text,
                        .data = .{ .text = .{
                            .content = @enumFromInt(run_start),
                        } },
                    });
                    try p.addScratchExtraNode(text_node);
                    run_start = p.string_bytes.items.len;
                }
                const break_node = try p.addNode(.{
                    .tag = .line_break,
                    .data = .{ .none = {} },
                });
                try p.addScratchExtraNode(break_node);
            },
        }
    }

    // Flush whatever text remains after the last line break.
    const has_text = p.string_bytes.items.len > run_start;
    if (has_text) {
        try p.string_bytes.append(gpa, 0);
        const text_node = try p.addNode(.{
            .tag = .text,
            .data = .{ .text = .{
                .content = @enumFromInt(run_start),
            } },
        });
        try p.addScratchExtraNode(text_node);
    }
}
|
|
|
|
/// An iterator over parts of textual content, handling unescaping of
/// escaped characters and line breaks. Bytes that are NUL or do not form
/// valid UTF-8 are replaced with U+FFFD.
const TextIterator = struct {
    /// The raw content being iterated.
    content: []const u8,
    /// Byte offset of the next unread position in `content`.
    pos: usize = 0,

    const Content = union(enum) {
        /// A single ASCII byte.
        char: u8,
        /// A slice of bytes: either one valid multi-byte UTF-8 sequence
        /// borrowed from `content`, or `replacement`.
        text: []const u8,
        /// A backslash immediately followed by a newline.
        line_break,
    };

    const replacement = "\u{FFFD}";

    /// Returns the next piece of content, or `null` once the content is
    /// exhausted.
    ///
    /// A backslash followed by a newline yields `line_break`. A backslash
    /// followed by punctuation yields that punctuation character alone. Any
    /// other backslash (including one at the very end) is yielded literally,
    /// with the following character left for the next call.
    fn next(iter: *TextIterator) ?Content {
        if (iter.pos >= iter.content.len) return null;
        if (iter.content[iter.pos] == '\\') {
            iter.pos += 1;
            if (iter.pos == iter.content.len) {
                return .{ .char = '\\' };
            } else if (iter.content[iter.pos] == '\n') {
                iter.pos += 1;
                return .line_break;
            } else if (isPunctuation(iter.content[iter.pos])) {
                const c = iter.content[iter.pos];
                iter.pos += 1;
                return .{ .char = c };
            } else {
                return .{ .char = '\\' };
            }
        }
        return iter.nextCodepoint();
    }

    /// Consumes one code point starting at `iter.pos`, which must be in
    /// bounds. NUL bytes and invalid UTF-8 yield `replacement`.
    fn nextCodepoint(iter: *TextIterator) ?Content {
        switch (iter.content[iter.pos]) {
            0 => {
                iter.pos += 1;
                return .{ .text = replacement };
            },
            1...127 => |c| {
                iter.pos += 1;
                return .{ .char = c };
            },
            else => |b| {
                // A byte which cannot begin a sequence (such as a stray
                // continuation byte) is replaced on its own.
                const cp_len = std.unicode.utf8ByteSequenceLength(b) catch {
                    iter.pos += 1;
                    return .{ .text = replacement };
                };
                const is_valid = iter.pos + cp_len <= iter.content.len and
                    std.unicode.utf8ValidateSlice(iter.content[iter.pos..][0..cp_len]);
                if (is_valid) {
                    const cp_encoded = iter.content[iter.pos..][0..cp_len];
                    iter.pos += cp_len;
                    return .{ .text = cp_encoded };
                }

                // Invalid or truncated sequence. Skip the lead byte and only
                // those continuation bytes that actually follow it (at most
                // `cp_len - 1`), so that unrelated bytes after a broken
                // sequence (ASCII text, escapes, newlines) are not swallowed
                // by the replacement.
                const limit = @min(iter.pos + cp_len, iter.content.len);
                iter.pos += 1;
                while (iter.pos < limit and (iter.content[iter.pos] & 0xC0) == 0x80) {
                    iter.pos += 1;
                }
                return .{ .text = replacement };
            },
        }
    }
};
|
|
};
|
|
|
|
/// Parses `content` as inline content and returns its encoded children.
/// Leading and trailing spaces, tabs and newlines are ignored.
fn parseInlines(p: *Parser, content: []const u8) !ExtraIndex {
    const trimmed = mem.trim(u8, content, " \t\n");
    var inline_parser: InlineParser = .{
        .parent = p,
        .content = trimmed,
    };
    defer inline_parser.deinit();
    return try inline_parser.parse();
}
|
|
|
|
/// Decodes a value of struct type `T` from consecutive `extra` entries
/// beginning at `index`, one entry per field. Every field of `T` must be a
/// `u32`. The result also carries the index just past the last entry read.
pub fn extraData(p: Parser, comptime T: type, index: ExtraIndex) ExtraData(T) {
    const fields = @typeInfo(T).@"struct".fields;
    const base: usize = @intFromEnum(index);
    var decoded: T = undefined;
    inline for (fields, 0..) |field, offset| {
        @field(decoded, field.name) = switch (field.type) {
            u32 => p.extra.items[base + offset],
            else => @compileError("bad field type"),
        };
    }
    return .{ .data = decoded, .end = base + fields.len };
}
|
|
|
|
/// Returns the child node indices stored in `extra` at `index`: a
/// `Node.Children` header giving the count, followed by that many indices.
pub fn extraChildren(p: Parser, index: ExtraIndex) []const Node.Index {
    const header = p.extraData(Node.Children, index);
    const first = header.end;
    const count = header.data.len;
    return @ptrCast(p.extra.items[first..][0..count]);
}
|
|
|
|
/// Appends `node` to the node list and returns the index it was stored at.
fn addNode(p: *Parser, node: Node) !Node.Index {
    const slot: u32 = @intCast(p.nodes.len);
    try p.nodes.append(p.allocator, node);
    return @enumFromInt(slot);
}
|
|
|
|
/// Stores `s` in `string_bytes` followed by a NUL terminator and returns
/// the index of its first byte. The empty string is represented by `.empty`
/// and stores nothing.
fn addString(p: *Parser, s: []const u8) !StringIndex {
    if (s.len == 0) return .empty;

    const offset: u32 = @intCast(p.string_bytes.items.len);
    // Reserve room for the bytes and the terminator in one step so the
    // appends below cannot fail.
    try p.string_bytes.ensureUnusedCapacity(p.allocator, s.len + 1);
    p.string_bytes.appendSliceAssumeCapacity(s);
    p.string_bytes.appendAssumeCapacity(0);
    return @enumFromInt(offset);
}
|
|
|
|
/// Stores a child list in `extra` as a count followed by the node indices
/// and returns the index of the count.
fn addExtraChildren(p: *Parser, nodes: []const Node.Index) !ExtraIndex {
    const offset: u32 = @intCast(p.extra.items.len);
    // Reserve room for the count and every index in one step so the appends
    // below cannot fail.
    try p.extra.ensureUnusedCapacity(p.allocator, nodes.len + 1);
    p.extra.appendAssumeCapacity(@intCast(nodes.len));
    p.extra.appendSliceAssumeCapacity(@ptrCast(nodes));
    return @enumFromInt(offset);
}
|
|
|
|
/// Pushes the integer value of `node` onto `scratch_extra`.
fn addScratchExtraNode(p: *Parser, node: Node.Index) !void {
    const raw: u32 = @intFromEnum(node);
    try p.scratch_extra.append(p.allocator, raw);
}
|
|
|
|
/// Appends `line` followed by a newline to `scratch_string`.
fn addScratchStringLine(p: *Parser, line: []const u8) !void {
    const buffer = &p.scratch_string;
    // Reserve room for the line and its newline in one step so the appends
    // below cannot fail.
    try buffer.ensureUnusedCapacity(p.allocator, line.len + 1);
    buffer.appendSliceAssumeCapacity(line);
    buffer.appendAssumeCapacity('\n');
}
|
|
|
|
/// Reports whether `line` consists solely of spaces and tabs. An empty line
/// is blank.
fn isBlank(line: []const u8) bool {
    for (line) |c| {
        if (c != ' ' and c != '\t') return false;
    }
    return true;
}
|
|
|
|
/// Reports whether `c` is an ASCII punctuation character, i.e. one of
/// ``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``.
fn isPunctuation(c: u8) bool {
    return switch (c) {
        '!'...'/', // ! " # $ % & ' ( ) * + , - . /
        ':'...'@', // : ; < = > ? @
        '['...'`', // [ \ ] ^ _ `
        '{'...'~', // { | } ~
        => true,
        else => false,
    };
}
|