//! A Markdown parser producing `Document`s.
//!
//! The parser operates at two levels: at the outer level, the parser accepts
//! the content of an input document line by line and begins building the _block
//! structure_ of the document. This creates a stack of currently open blocks.
//!
//! When the parser detects the end of a block, it closes the block, popping it
//! from the open block stack and completing any additional parsing of the
//! block's content. For blocks which contain parseable inline content, this
//! invokes the inner level of the parser, handling the _inline structure_ of
//! the block.
//!
//! Inline parsing scans through the collected inline content of a block. When
//! it encounters a character that could indicate the beginning of an inline, it
//! either handles the inline right away (if possible) or adds it to a pending
//! inlines stack. When an inline is completed, it is added to a list of
//! completed inlines, which (along with any surrounding text nodes) will become
//! the children of the parent inline or the block whose inline content is being
//! parsed.
const std = @import("std");
const mem = std.mem;
const assert = std.debug.assert;
const isWhitespace = std.ascii.isWhitespace;
const Allocator = mem.Allocator;
const expectEqual = std.testing.expectEqual;
const Document = @import("Document.zig");
const Node = Document.Node;
const ExtraIndex = Document.ExtraIndex;
const ExtraData = Document.ExtraData;
const StringIndex = Document.StringIndex;

// Completed nodes of the document; `init` reserves index 0 for the root node.
nodes: Node.List = .{},
// Extra data referenced by completed nodes (e.g. encoded children lists).
extra: std.ArrayListUnmanaged(u32) = .empty,
// Node indices of completed children of the currently open blocks; each open
// block records its starting offset into this buffer (`Block.extra_start`).
scratch_extra: std.ArrayListUnmanaged(u32) = .empty,
// String data referenced by completed nodes; `init` reserves byte 0.
string_bytes: std.ArrayListUnmanaged(u8) = .empty,
// Pending inline content of the currently open blocks; each open block records
// its starting offset into this buffer (`Block.string_start`).
scratch_string: std.ArrayListUnmanaged(u8) = .empty,
// The stack of currently open blocks, outermost first.
pending_blocks: std.ArrayListUnmanaged(Block) = .empty,
allocator: Allocator,

const Parser = @This();

/// An arbitrary limit on the maximum number of columns in a table so that
/// table-related metadata maintained by the parser does not require dynamic
/// memory allocation.
const max_table_columns = 128;

/// A block element which is still receiving children.
const Block = struct {
    tag: Tag,
    data: Data,
    // Offset into `scratch_extra` where this block's children begin.
    extra_start: usize,
    // Offset into `scratch_string` where this block's inline content begins.
    string_start: usize,

    const Tag = enum {
        /// Data is `list`.
        list,
        /// Data is `list_item`.
        list_item,
        /// Data is `table`.
        table,
        /// Data is `none`.
        table_row,
        /// Data is `heading`.
        heading,
        /// Data is `code_block`.
        code_block,
        /// Data is `none`.
        blockquote,
        /// Data is `none`.
        paragraph,
        /// Data is `none`.
        thematic_break,
    };

    const Data = union {
        none: void,
        list: struct {
            marker: ListMarker,
            /// Between 0 and 999,999,999, inclusive.
            start: u30,
            tight: bool,
            last_line_blank: bool = false,
        },
        list_item: struct {
            continuation_indent: usize,
        },
        table: struct {
            column_alignments: std.BoundedArray(Node.TableCellAlignment, max_table_columns) = .{},
        },
        heading: struct {
            /// Between 1 and 6, inclusive.
            level: u3,
        },
        code_block: struct {
            tag: StringIndex,
            fence_len: usize,
            indent: usize,
        },

        const ListMarker = enum {
            @"-",
            @"*",
            @"+",
            number_dot,
            number_paren,
        };
    };

    /// The kind of content a block may directly contain.
    const ContentType = enum {
        blocks,
        inlines,
        raw_inlines,
        nothing,
    };

    fn canAccept(b: Block) ContentType {
        return switch (b.tag) {
            .list,
            .list_item,
            .table,
            .blockquote,
            => .blocks,
            .heading,
            .paragraph,
            => .inlines,
            .code_block,
            => .raw_inlines,
            .table_row,
            .thematic_break,
            => .nothing,
        };
    }

    /// Attempts to continue `b` using the contents of `line`. If successful,
    /// returns the remaining portion of `line` to be considered part of `b`
    /// (e.g. for a blockquote, this would be everything except the leading
    /// `>`). If unsuccessful, returns null.
    fn match(b: Block, line: []const u8) ?[]const u8 {
        const unindented = mem.trimStart(u8, line, " \t");
        const indent = line.len - unindented.len;
        return switch (b.tag) {
            // A list itself continues unconditionally; whether it actually
            // stays open is decided by its list items.
            .list => line,
            .list_item => if (indent >= b.data.list_item.continuation_indent)
                line[b.data.list_item.continuation_indent..]
            else if (unindented.len == 0)
                // Blank lines should not close list items, since there may be
                // more indented contents to follow after the blank line.
                ""
            else
                null,
            .table => if (unindented.len > 0) line else null,
            .table_row => null,
            .heading => null,
            .code_block => code_block: {
                const trimmed = mem.trimEnd(u8, unindented, " \t");
                // The block continues unless this line is exactly a closing
                // fence: only backticks, of the same length as the opener.
                if (mem.indexOfNone(u8, trimmed, "`") != null or trimmed.len != b.data.code_block.fence_len) {
                    // Strip at most the indentation of the opening fence.
                    const effective_indent = @min(indent, b.data.code_block.indent);
                    break :code_block line[effective_indent..];
                } else {
                    break :code_block null;
                }
            },
            .blockquote => if (mem.startsWith(u8, unindented, ">"))
                unindented[1..]
            else
                null,
            .paragraph => if (unindented.len > 0) line else null,
            .thematic_break => null,
        };
    }
};

pub fn init(allocator: Allocator) Allocator.Error!Parser {
    var p: Parser = .{ .allocator = allocator };
    // Node 0 is always the document root; its data is filled in by `endInput`.
    try p.nodes.append(allocator, .{
        .tag = .root,
        .data = undefined,
    });
    // Reserve string_bytes index 0.
    try p.string_bytes.append(allocator, 0);
    return p;
}

pub fn deinit(p: *Parser) void {
    p.nodes.deinit(p.allocator);
    p.extra.deinit(p.allocator);
    p.scratch_extra.deinit(p.allocator);
    p.string_bytes.deinit(p.allocator);
    p.scratch_string.deinit(p.allocator);
    p.pending_blocks.deinit(p.allocator);
    p.* = undefined;
}

/// Accepts a single line of content. `line` should not have a trailing line
/// ending character.
pub fn feedLine(p: *Parser, line: []const u8) Allocator.Error!void {
    var rest_line = line;
    // Match the line against the open blocks from outermost inward, peeling
    // off each block's prefix; `first_unmatched` is the index of the first
    // open block the line does not continue.
    const first_unmatched = for (p.pending_blocks.items, 0..) |b, i| {
        if (b.match(rest_line)) |rest| {
            rest_line = rest;
        } else {
            break i;
        }
    } else p.pending_blocks.items.len;

    const in_code_block = p.pending_blocks.items.len > 0 and
        p.pending_blocks.getLast().tag == .code_block;
    const code_block_end = in_code_block and
        first_unmatched + 1 == p.pending_blocks.items.len;

    // New blocks cannot be started if we are actively inside a code block or
    // are just closing one (to avoid interpreting the closing ``` as a new code
    // block start).
    var maybe_block_start = if (!in_code_block or first_unmatched + 2 <= p.pending_blocks.items.len)
        try p.startBlock(rest_line)
    else
        null;

    // This is a lazy continuation line if there are no new blocks to open and
    // the last open block is a paragraph.
    if (maybe_block_start == null and
        !isBlank(rest_line) and
        p.pending_blocks.items.len > 0 and
        p.pending_blocks.getLast().tag == .paragraph)
    {
        try p.addScratchStringLine(mem.trimStart(u8, rest_line, " \t"));
        return;
    }

    // If a new block needs to be started, any paragraph needs to be closed,
    // even though this isn't detected as part of the closing condition for
    // paragraphs.
if (maybe_block_start != null and
        p.pending_blocks.items.len > 0 and
        p.pending_blocks.getLast().tag == .paragraph)
    {
        try p.closeLastBlock();
    }

    // Close every open block that failed to match this line.
    while (p.pending_blocks.items.len > first_unmatched) {
        try p.closeLastBlock();
    }

    // Open as many nested new blocks as this line starts.
    while (maybe_block_start) |block_start| : (maybe_block_start = try p.startBlock(rest_line)) {
        try p.appendBlockStart(block_start);
        // There may be more blocks to start within the same line.
        rest_line = block_start.rest;
        // Headings may only contain inline content.
        if (block_start.tag == .heading) break;
        // An opening code fence does not contain any additional block or inline
        // content to process.
        if (block_start.tag == .code_block) return;
    }

    // Do not append the end of a code block (```) as textual content.
    if (code_block_end) return;

    // Whatever remains of the line becomes content of the innermost open
    // block (or a fresh paragraph at the top level).
    const can_accept = if (p.pending_blocks.getLastOrNull()) |last_pending_block|
        last_pending_block.canAccept()
    else
        .blocks;

    const rest_line_trimmed = mem.trimStart(u8, rest_line, " \t");
    switch (can_accept) {
        .blocks => {
            // If we're inside a list item and the rest of the line is blank, it
            // means that any subsequent child of the list item (or subsequent
            // item in the list) will cause the containing list to be considered
            // loose. However, we can't immediately declare that the list is
            // loose, since we might just be looking at a blank line after the
            // end of the last item in the list. The final determination will be
            // made when appending the next child of the list or list item.
            const maybe_containing_list_index = if (p.pending_blocks.items.len > 0 and
                p.pending_blocks.getLast().tag == .list_item)
                p.pending_blocks.items.len - 2
            else
                null;

            if (rest_line_trimmed.len > 0) {
                try p.appendBlockStart(.{
                    .tag = .paragraph,
                    .data = .{ .none = {} },
                    .rest = undefined,
                });
                try p.addScratchStringLine(rest_line_trimmed);
            }

            if (maybe_containing_list_index) |containing_list_index| {
                p.pending_blocks.items[containing_list_index].data.list.last_line_blank = rest_line_trimmed.len == 0;
            }
        },
        .inlines => try p.addScratchStringLine(rest_line_trimmed),
        // Code block content keeps its (remaining) indentation.
        .raw_inlines => try p.addScratchStringLine(rest_line),
        .nothing => {},
    }
}

/// Completes processing of the input and returns the parsed document.
pub fn endInput(p: *Parser) Allocator.Error!Document {
    while (p.pending_blocks.items.len > 0) {
        try p.closeLastBlock();
    }
    // There should be no inline content pending after closing the last open
    // block.
    assert(p.scratch_string.items.len == 0);

    // The remaining scratch nodes are the top-level children of the root.
    const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items));
    p.nodes.items(.data)[0] = .{ .container = .{ .children = children } };
    p.scratch_string.items.len = 0;
    p.scratch_extra.items.len = 0;

    // Transfer ownership of the buffers to the returned Document.
    var nodes = p.nodes.toOwnedSlice();
    errdefer nodes.deinit(p.allocator);
    const extra = try p.extra.toOwnedSlice(p.allocator);
    errdefer p.allocator.free(extra);
    const string_bytes = try p.string_bytes.toOwnedSlice(p.allocator);
    errdefer p.allocator.free(string_bytes);

    return .{
        .nodes = nodes,
        .extra = extra,
        .string_bytes = string_bytes,
    };
}

/// Data describing the start of a new block element.
const BlockStart = struct {
    tag: Tag,
    data: Data,
    /// The remaining portion of the line after the block opener.
    rest: []const u8,

    const Tag = enum {
        /// Data is `list_item`.
        list_item,
        /// Data is `table_row`.
        table_row,
        /// Data is `heading`.
        heading,
        /// Data is `code_block`.
        code_block,
        /// Data is `none`.
        blockquote,
        /// Data is `none`.
        paragraph,
        /// Data is `none`.
thematic_break,
    };

    const Data = union {
        none: void,
        list_item: struct {
            marker: Block.Data.ListMarker,
            number: u30,
            continuation_indent: usize,
        },
        table_row: struct {
            cells: std.BoundedArray([]const u8, max_table_columns),
        },
        heading: struct {
            /// Between 1 and 6, inclusive.
            level: u3,
        },
        code_block: struct {
            tag: StringIndex,
            fence_len: usize,
            indent: usize,
        },
    };
};

/// Opens a new block described by `block_start`, implicitly opening or closing
/// container blocks (lists, tables) as required by the new block's type.
fn appendBlockStart(p: *Parser, block_start: BlockStart) !void {
    if (p.pending_blocks.getLastOrNull()) |last_pending_block| {
        // Close the last block if it is a list and the new block is not a list item
        // or not of the same marker type.
        const should_close_list = last_pending_block.tag == .list and
            (block_start.tag != .list_item or
                block_start.data.list_item.marker != last_pending_block.data.list.marker);
        // The last block should also be closed if the new block is not a table
        // row, which is the only allowed child of a table.
        const should_close_table = last_pending_block.tag == .table and
            block_start.tag != .table_row;
        if (should_close_list or should_close_table) {
            try p.closeLastBlock();
        }
    }

    if (p.pending_blocks.getLastOrNull()) |last_pending_block| {
        // If the last block is a list or list item, check for tightness based
        // on the last line.
        const maybe_containing_list = switch (last_pending_block.tag) {
            .list => &p.pending_blocks.items[p.pending_blocks.items.len - 1],
            .list_item => &p.pending_blocks.items[p.pending_blocks.items.len - 2],
            else => null,
        };
        if (maybe_containing_list) |containing_list| {
            if (containing_list.data.list.last_line_blank) {
                containing_list.data.list.tight = false;
            }
        }
    }

    // Start a new list if the new block is a list item and there is no
    // containing list yet.
    if (block_start.tag == .list_item and
        (p.pending_blocks.items.len == 0 or p.pending_blocks.getLast().tag != .list))
    {
        try p.pending_blocks.append(p.allocator, .{
            .tag = .list,
            .data = .{ .list = .{
                .marker = block_start.data.list_item.marker,
                .start = block_start.data.list_item.number,
                .tight = true,
            } },
            .string_start = p.scratch_string.items.len,
            .extra_start = p.scratch_extra.items.len,
        });
    }

    if (block_start.tag == .table_row) {
        // Likewise, table rows start a table implicitly.
        if (p.pending_blocks.items.len == 0 or p.pending_blocks.getLast().tag != .table) {
            try p.pending_blocks.append(p.allocator, .{
                .tag = .table,
                .data = .{ .table = .{
                    .column_alignments = .{},
                } },
                .string_start = p.scratch_string.items.len,
                .extra_start = p.scratch_extra.items.len,
            });
        }

        // The number of rows already completed for this table tells us whether
        // this row may be a header delimiter (it must be row 0 or 1).
        const current_row = p.scratch_extra.items.len - p.pending_blocks.getLast().extra_start;
        if (current_row <= 1) {
            if (parseTableHeaderDelimiter(block_start.data.table_row.cells)) |alignments| {
                p.pending_blocks.items[p.pending_blocks.items.len - 1].data.table.column_alignments = alignments;
                if (current_row == 1) {
                    // We need to go back and mark the header row and its column
                    // alignments.
                    const datas = p.nodes.items(.data);
                    const header_data = datas[p.scratch_extra.getLast()];
                    for (p.extraChildren(header_data.container.children), 0..) |header_cell, i| {
                        const alignment = if (i < alignments.len) alignments.buffer[i] else .unset;
                        const cell_data = &datas[@intFromEnum(header_cell)].table_cell;
                        cell_data.info.alignment = alignment;
                        cell_data.info.header = true;
                    }
                }
                // The delimiter row itself produces no node.
                return;
            }
        }
    }

    const tag: Block.Tag, const data: Block.Data = switch (block_start.tag) {
        .list_item => .{ .list_item, .{ .list_item = .{
            .continuation_indent = block_start.data.list_item.continuation_indent,
        } } },
        .table_row => .{ .table_row, .{ .none = {} } },
        .heading => .{ .heading, .{ .heading = .{
            .level = block_start.data.heading.level,
        } } },
        .code_block => .{ .code_block, .{ .code_block = .{
            .tag = block_start.data.code_block.tag,
            .fence_len = block_start.data.code_block.fence_len,
            .indent = block_start.data.code_block.indent,
        } } },
        .blockquote => .{ .blockquote, .{ .none = {} } },
        .paragraph => .{ .paragraph, .{ .none = {} } },
        .thematic_break => .{ .thematic_break, .{ .none = {} } },
    };
    try p.pending_blocks.append(p.allocator, .{
        .tag = tag,
        .data = data,
        .string_start = p.scratch_string.items.len,
        .extra_start = p.scratch_extra.items.len,
    });

    if (tag == .table_row) {
        // Table rows are unique, since we already have all the children
        // available in the BlockStart. We can immediately parse and append
        // these children now.
        const containing_table = p.pending_blocks.items[p.pending_blocks.items.len - 2];
        const column_alignments = containing_table.data.table.column_alignments.slice();
        for (block_start.data.table_row.cells.slice(), 0..) |cell_content, i| {
            const cell_children = try p.parseInlines(cell_content);
            const alignment = if (i < column_alignments.len) column_alignments[i] else .unset;
            const cell = try p.addNode(.{
                .tag = .table_cell,
                .data = .{ .table_cell = .{
                    .info = .{
                        .alignment = alignment,
                        .header = false,
                    },
                    .children = cell_children,
                } },
            });
            try p.addScratchExtraNode(cell);
        }
    }
}

/// Recognizes the start of a new block at the beginning of `line`, returning
/// null if `line` does not open any block. Earlier cases take precedence.
fn startBlock(p: *Parser, line: []const u8) !?BlockStart {
    const unindented = mem.trimStart(u8, line, " \t");
    const indent = line.len - unindented.len;
    if (isThematicBreak(line)) {
        // Thematic breaks take precedence over list items.
        return .{
            .tag = .thematic_break,
            .data = .{ .none = {} },
            .rest = "",
        };
    } else if (startListItem(unindented)) |list_item| {
        return .{
            .tag = .list_item,
            .data = .{ .list_item = .{
                .marker = list_item.marker,
                .number = list_item.number,
                .continuation_indent = indent + list_item.marker_len,
            } },
            .rest = list_item.rest,
        };
    } else if (startTableRow(unindented)) |table_row| {
        return .{
            .tag = .table_row,
            .data = .{ .table_row = .{
                .cells = table_row.cells,
            } },
            .rest = "",
        };
    } else if (startHeading(unindented)) |heading| {
        return .{
            .tag = .heading,
            .data = .{ .heading = .{
                .level = heading.level,
            } },
            .rest = heading.rest,
        };
    } else if (try p.startCodeBlock(unindented)) |code_block| {
        return .{
            .tag = .code_block,
            .data = .{ .code_block = .{
                .tag = code_block.tag,
                .fence_len = code_block.fence_len,
                .indent = indent,
            } },
            .rest = "",
        };
    } else if (startBlockquote(unindented)) |rest| {
        return .{
            .tag = .blockquote,
            .data = .{ .none = {} },
            .rest = rest,
        };
    } else {
        return null;
    }
}

const ListItemStart = struct {
    marker: Block.Data.ListMarker,
    number: u30,
    marker_len: usize,
    rest: []const u8,
};

fn startListItem(unindented_line: []const u8) ?ListItemStart {
    // Unordered markers: "- ", "* ", or "+ ".
    if (mem.startsWith(u8, unindented_line, "- ")) {
        return .{
            .marker = .@"-",
            .number = undefined,
            .marker_len = 2,
            .rest = unindented_line[2..],
        };
    } else if (mem.startsWith(u8, unindented_line, "* ")) {
        return .{
            .marker =
.@"*",
            .number = undefined,
            .marker_len = 2,
            .rest = unindented_line[2..],
        };
    } else if (mem.startsWith(u8, unindented_line, "+ ")) {
        return .{
            .marker = .@"+",
            .number = undefined,
            .marker_len = 2,
            .rest = unindented_line[2..],
        };
    }

    // Ordered markers: one or more digits followed by ". " or ") ".
    const number_end = mem.indexOfNone(u8, unindented_line, "0123456789") orelse return null;
    const after_number = unindented_line[number_end..];
    const marker: Block.Data.ListMarker = if (mem.startsWith(u8, after_number, ". "))
        .number_dot
    else if (mem.startsWith(u8, after_number, ") "))
        .number_paren
    else
        return null;
    const number = std.fmt.parseInt(u30, unindented_line[0..number_end], 10) catch return null;
    if (number > 999_999_999) return null;
    return .{
        .marker = marker,
        .number = number,
        .marker_len = number_end + 2,
        .rest = after_number[2..],
    };
}

const TableRowStart = struct {
    cells: std.BoundedArray([]const u8, max_table_columns),
};

/// Recognizes a table row: a line starting with `|` and ending with an
/// unescaped `|`, returning its cell contents (the text between pipes).
fn startTableRow(unindented_line: []const u8) ?TableRowStart {
    if (unindented_line.len < 2 or
        !mem.startsWith(u8, unindented_line, "|") or
        mem.endsWith(u8, unindented_line, "\\|") or
        !mem.endsWith(u8, unindented_line, "|")) return null;

    var cells: std.BoundedArray([]const u8, max_table_columns) = .{};
    const table_row_content = unindented_line[1 .. unindented_line.len - 1];
    var cell_start: usize = 0;
    var i: usize = 0;
    while (i < table_row_content.len) : (i += 1) {
        switch (table_row_content[i]) {
            // Skip the escaped character following a backslash.
            '\\' => i += 1,
            '|' => {
                cells.append(table_row_content[cell_start..i]) catch return null;
                cell_start = i + 1;
            },
            '`' => {
                // Ignoring pipes in code spans allows table cells to contain
                // code using ||, for example.
                const open_start = i;
                i = mem.indexOfNonePos(u8, table_row_content, i, "`") orelse return null;
                const open_len = i - open_start;
                // A code span closes only on a backtick run of the same length
                // as the opening run.
                while (mem.indexOfScalarPos(u8, table_row_content, i, '`')) |close_start| {
                    i = mem.indexOfNonePos(u8, table_row_content, close_start, "`") orelse return null;
                    const close_len = i - close_start;
                    if (close_len == open_len) break;
                } else return null;
            },
            else => {},
        }
    }
    cells.append(table_row_content[cell_start..]) catch return null;

    return .{ .cells = cells };
}

/// Parses a table header delimiter row (e.g. `| --- | :-: |`), returning the
/// per-column alignments, or null if any cell is not a valid delimiter.
fn parseTableHeaderDelimiter(
    row_cells: std.BoundedArray([]const u8, max_table_columns),
) ?std.BoundedArray(Node.TableCellAlignment, max_table_columns) {
    var alignments: std.BoundedArray(Node.TableCellAlignment, max_table_columns) = .{};
    for (row_cells.slice()) |content| {
        const alignment = parseTableHeaderDelimiterCell(content) orelse return null;
        alignments.appendAssumeCapacity(alignment);
    }
    return alignments;
}

/// Parses a single delimiter cell: optional spaces, an optional `:` anchor, a
/// run of `-`s, an optional `:` anchor, optional spaces. The anchors determine
/// the alignment (left, right, center, or unset).
fn parseTableHeaderDelimiterCell(content: []const u8) ?Node.TableCellAlignment {
    var state: enum {
        before_rule,
        after_left_anchor,
        in_rule,
        after_right_anchor,
        after_rule,
    } = .before_rule;
    var left_anchor = false;
    var right_anchor = false;
    for (content) |c| {
        switch (state) {
            .before_rule => switch (c) {
                ' ' => {},
                ':' => {
                    left_anchor = true;
                    state = .after_left_anchor;
                },
                '-' => state = .in_rule,
                else => return null,
            },
            .after_left_anchor => switch (c) {
                '-' => state = .in_rule,
                else => return null,
            },
            .in_rule => switch (c) {
                '-' => {},
                ':' => {
                    right_anchor = true;
                    state = .after_right_anchor;
                },
                ' ' => state = .after_rule,
                else => return null,
            },
            .after_right_anchor => switch (c) {
                ' ' => state = .after_rule,
                else => return null,
            },
            .after_rule => switch (c) {
                ' ' => {},
                else => return null,
            },
        }
    }

    // The cell must contain at least one '-'.
    switch (state) {
        .before_rule,
        .after_left_anchor,
        => return null,
        .in_rule,
        .after_right_anchor,
        .after_rule,
        => {},
    }

    return if (left_anchor and right_anchor)
        .center
    else if (left_anchor)
        .left
    else if (right_anchor)
        .right
    else
        .unset;
}

test
parseTableHeaderDelimiterCell {
    try expectEqual(null, parseTableHeaderDelimiterCell(""));
    try expectEqual(null, parseTableHeaderDelimiterCell(" "));
    try expectEqual(.unset, parseTableHeaderDelimiterCell("-"));
    try expectEqual(.unset, parseTableHeaderDelimiterCell(" - "));
    try expectEqual(.unset, parseTableHeaderDelimiterCell("----"));
    try expectEqual(.unset, parseTableHeaderDelimiterCell(" ---- "));
    try expectEqual(null, parseTableHeaderDelimiterCell(":"));
    try expectEqual(null, parseTableHeaderDelimiterCell("::"));
    try expectEqual(.left, parseTableHeaderDelimiterCell(":-"));
    try expectEqual(.left, parseTableHeaderDelimiterCell(" :----"));
    try expectEqual(.center, parseTableHeaderDelimiterCell(":-:"));
    try expectEqual(.center, parseTableHeaderDelimiterCell(":----:"));
    try expectEqual(.center, parseTableHeaderDelimiterCell(" :----: "));
    try expectEqual(.right, parseTableHeaderDelimiterCell("-:"));
    try expectEqual(.right, parseTableHeaderDelimiterCell("----:"));
    try expectEqual(.right, parseTableHeaderDelimiterCell(" ----: "));
}

const HeadingStart = struct {
    level: u3,
    rest: []const u8,
};

/// Recognizes an ATX-style heading: 1-6 `#`s followed by a space.
fn startHeading(unindented_line: []const u8) ?HeadingStart {
    var level: u3 = 0;
    return for (unindented_line, 0..) |c, i| {
        switch (c) {
            '#' => {
                if (level == 6) break null;
                level += 1;
            },
            ' ' => {
                // We must have seen at least one # by this point, since
                // unindented_line has no leading spaces.
                assert(level > 0);
                break .{
                    .level = level,
                    .rest = unindented_line[i + 1 ..],
                };
            },
            else => break null,
        }
    } else null;
}

const CodeBlockStart = struct {
    tag: StringIndex,
    fence_len: usize,
};

/// Recognizes a fenced code block opener: at least three backticks, optionally
/// followed by a tag (e.g. a language name).
fn startCodeBlock(p: *Parser, unindented_line: []const u8) !?CodeBlockStart {
    var fence_len: usize = 0;
    const tag_bytes = for (unindented_line, 0..) |c, i| {
        switch (c) {
            '`' => fence_len += 1,
            else => break unindented_line[i..],
        }
    } else "";
    // Code block tags may not contain backticks, since that would create
    // potential confusion with inline code spans.
    if (fence_len < 3 or mem.indexOfScalar(u8, tag_bytes, '`') != null) return null;
    return .{
        .tag = try p.addString(mem.trim(u8, tag_bytes, " ")),
        .fence_len = fence_len,
    };
}

/// Recognizes a blockquote marker, returning the content after the `>`.
fn startBlockquote(unindented_line: []const u8) ?[]const u8 {
    return if (mem.startsWith(u8, unindented_line, ">"))
        unindented_line[1..]
    else
        null;
}

/// Returns true if `line` is a thematic break: at least three of the same
/// character out of `-`, `_`, or `*`, optionally interspersed with spaces.
fn isThematicBreak(line: []const u8) bool {
    var char: ?u8 = null;
    var count: usize = 0;
    for (line) |c| {
        switch (c) {
            ' ' => {},
            '-', '_', '*' => {
                // All break characters on the line must match.
                if (char != null and c != char.?) return false;
                char = c;
                count += 1;
            },
            else => return false,
        }
    }
    return count >= 3;
}

/// Pops the innermost open block, finishes parsing its content into a
/// completed node, and appends that node to the scratch data of its parent.
fn closeLastBlock(p: *Parser) !void {
    const b = p.pending_blocks.pop().?;
    const node = switch (b.tag) {
        .list => list: {
            assert(b.string_start == p.scratch_string.items.len);
            // Although tightness is parsed as a property of the list, it is
            // stored at the list item level to make it possible to render each
            // node without any context from its parents.
            const list_items = p.scratch_extra.items[b.extra_start..];
            const node_datas = p.nodes.items(.data);
            if (!b.data.list.tight) {
                for (list_items) |list_item| {
                    node_datas[list_item].list_item.tight = false;
                }
            }
            const children = try p.addExtraChildren(@ptrCast(list_items));
            break :list try p.addNode(.{
                .tag = .list,
                .data = .{ .list = .{
                    .start = switch (b.data.list.marker) {
                        .number_dot, .number_paren => @enumFromInt(b.data.list.start),
                        .@"-", .@"*", .@"+" => .unordered,
                    },
                    .children = children,
                } },
            });
        },
        .list_item => list_item: {
            assert(b.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
            break :list_item try p.addNode(.{
                .tag = .list_item,
                .data = .{ .list_item = .{
                    // May be flipped to false later when the list closes.
                    .tight = true,
                    .children = children,
                } },
            });
        },
        .table => table: {
            assert(b.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
            break :table try p.addNode(.{
                .tag = .table,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .table_row => table_row: {
            assert(b.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
            break :table_row try p.addNode(.{
                .tag = .table_row,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .heading => heading: {
            const children = try p.parseInlines(p.scratch_string.items[b.string_start..]);
            break :heading try p.addNode(.{
                .tag = .heading,
                .data = .{ .heading = .{
                    .level = b.data.heading.level,
                    .children = children,
                } },
            });
        },
        .code_block => code_block: {
            // Code block content is stored verbatim, not parsed as inlines.
            const content = try p.addString(p.scratch_string.items[b.string_start..]);
            break :code_block try p.addNode(.{
                .tag = .code_block,
                .data = .{ .code_block = .{
                    .tag = b.data.code_block.tag,
                    .content = content,
                } },
            });
        },
        .blockquote => blockquote: {
            assert(b.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
            break :blockquote try p.addNode(.{
                .tag = .blockquote,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .paragraph => paragraph: {
            const children = try p.parseInlines(p.scratch_string.items[b.string_start..]);
            break :paragraph try p.addNode(.{
                .tag = .paragraph,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .thematic_break => try p.addNode(.{
            .tag = .thematic_break,
            .data = .{ .none = {} },
        }),
    };
    // Release this block's scratch data and hand the finished node to the
    // parent block.
    p.scratch_string.items.len = b.string_start;
    p.scratch_extra.items.len = b.extra_start;
    try p.addScratchExtraNode(node);
}

/// The inner level of the parser: parses the inline structure of a block's
/// collected content.
const InlineParser = struct {
    parent: *Parser,
    content: []const u8,
    pos: usize = 0,
    // Potential inline openers seen so far, innermost last.
    pending_inlines: std.ArrayListUnmanaged(PendingInline) = .empty,
    // Finished inlines awaiting encoding into their parent's children.
    completed_inlines: std.ArrayListUnmanaged(CompletedInline) = .empty,

    const PendingInline = struct {
        tag: Tag,
        data: Data,
        start: usize,

        const Tag = enum {
            /// Data is `emphasis`.
            emphasis,
            /// Data is `none`.
            link,
            /// Data is `none`.
image,
        };

        const Data = union {
            none: void,
            emphasis: struct {
                underscore: bool,
                run_len: usize,
            },
        };
    };

    /// A completed inline node covering `content[start..][0..len]`.
    const CompletedInline = struct {
        node: Node.Index,
        start: usize,
        len: usize,
    };

    fn deinit(ip: *InlineParser) void {
        ip.pending_inlines.deinit(ip.parent.allocator);
        ip.completed_inlines.deinit(ip.parent.allocator);
    }

    /// Parses all of `ip.content`, returning the children of the node
    /// containing the inline content.
    fn parse(ip: *InlineParser) Allocator.Error!ExtraIndex {
        while (ip.pos < ip.content.len) : (ip.pos += 1) {
            // Dispatch on characters that may open or close an inline.
            switch (ip.content[ip.pos]) {
                // A backslash escapes the following character.
                '\\' => ip.pos += 1,
                '[' => try ip.pending_inlines.append(ip.parent.allocator, .{
                    .tag = .link,
                    .data = .{ .none = {} },
                    .start = ip.pos,
                }),
                '!' => if (ip.pos + 1 < ip.content.len and ip.content[ip.pos + 1] == '[') {
                    try ip.pending_inlines.append(ip.parent.allocator, .{
                        .tag = .image,
                        .data = .{ .none = {} },
                        .start = ip.pos,
                    });
                    ip.pos += 1;
                },
                ']' => try ip.parseLink(),
                '<' => try ip.parseAutolink(),
                '*', '_' => try ip.parseEmphasis(),
                '`' => try ip.parseCodeSpan(),
                // Text autolinks ("http...") are recognized only at the start
                // of content or after certain characters.
                'h' => if (ip.pos == 0 or isPreTextAutolink(ip.content[ip.pos - 1])) {
                    try ip.parseTextAutolink();
                },
                else => {},
            }
        }

        const children = try ip.encodeChildren(0, ip.content.len);
        // There may be pending inlines after parsing (e.g. unclosed emphasis
        // runs), but there must not be any completed inlines, since those
        // should all be part of `children`.
        assert(ip.completed_inlines.items.len == 0);
        return children;
    }

    /// Parses a link, starting at the `]` at the end of the link text. `ip.pos`
    /// is left at the closing `)` of the link target or at the closing `]` if
    /// there is none.
    fn parseLink(ip: *InlineParser) !void {
        // Find the innermost pending link or image opener.
        var i = ip.pending_inlines.items.len;
        while (i > 0) {
            i -= 1;
            if (ip.pending_inlines.items[i].tag == .link or
                ip.pending_inlines.items[i].tag == .image) break;
        } else return;
        const opener = ip.pending_inlines.items[i];
        // Discard the opener and anything pending above it.
        ip.pending_inlines.shrinkRetainingCapacity(i);
        const text_start = switch (opener.tag) {
            // Skip "[" or "![" respectively.
            .link => opener.start + 1,
            .image => opener.start + 2,
            else => unreachable,
        };

        // The "]" must be immediately followed by "(" to form a link.
        if (ip.pos + 1 >= ip.content.len or ip.content[ip.pos + 1] != '(') return;
        const text_end = ip.pos;
        const target_start = text_end + 2;
        var target_end = target_start;
        // Find the matching ")", allowing balanced nested parentheses and
        // backslash escapes inside the target.
        var nesting_level: usize = 1;
        while (target_end < ip.content.len) : (target_end += 1) {
            switch (ip.content[target_end]) {
                '\\' => target_end += 1,
                '(' => nesting_level += 1,
                ')' => {
                    if (nesting_level == 1) break;
                    nesting_level -= 1;
                },
                else => {},
            }
        } else return;
        ip.pos = target_end;

        const children = try ip.encodeChildren(text_start, text_end);
        const target = try ip.encodeLinkTarget(target_start, target_end);

        const link = try ip.parent.addNode(.{
            .tag = switch (opener.tag) {
                .link => .link,
                .image => .image,
                else => unreachable,
            },
            .data = .{ .link = .{
                .target = target,
                .children = children,
            } },
        });
        try ip.completed_inlines.append(ip.parent.allocator, .{
            .node = link,
            .start = opener.start,
            .len = ip.pos - opener.start + 1,
        });
    }

    /// Encodes the link target text in `ip.content[start..end]` directly into
    /// `string_bytes`, returning the index of the resulting string.
    fn encodeLinkTarget(ip: *InlineParser, start: usize, end: usize) !StringIndex {
        // For efficiency, we can encode directly into string_bytes rather than
        // creating a temporary string and then encoding it, since this process
        // is entirely linear.
        const string_top = ip.parent.string_bytes.items.len;
        errdefer ip.parent.string_bytes.shrinkRetainingCapacity(string_top);

        var text_iter: TextIterator = .{ .content = ip.content[start..end] };
        while (text_iter.next()) |content| {
            switch (content) {
                .char => |c| try ip.parent.string_bytes.append(ip.parent.allocator, c),
                .text => |s| try ip.parent.string_bytes.appendSlice(ip.parent.allocator, s),
                .line_break => try ip.parent.string_bytes.appendSlice(ip.parent.allocator, "\\\n"),
            }
        }
        // Strings in string_bytes are null-terminated.
        try ip.parent.string_bytes.append(ip.parent.allocator, 0);

        return @enumFromInt(string_top);
    }

    /// Parses an autolink, starting at the opening `<`. `ip.pos` is left at the
    /// closing `>`, or remains unchanged at the opening `<` if there is none.
    fn parseAutolink(ip: *InlineParser) !void {
        const start = ip.pos;
        ip.pos += 1;
        var state: enum {
            start,
            scheme,
            target,
        } = .start;
        while (ip.pos < ip.content.len) : (ip.pos += 1) {
            switch (state) {
                .start => switch (ip.content[ip.pos]) {
                    // The scheme must start with a letter.
                    'A'...'Z', 'a'...'z' => state = .scheme,
                    else => break,
                },
                .scheme => switch (ip.content[ip.pos]) {
                    'A'...'Z', 'a'...'z', '0'...'9', '+', '.', '-' => {},
                    ':' => state = .target,
                    else => break,
                },
                .target => switch (ip.content[ip.pos]) {
                    '<', ' ', '\t', '\n' => break, // Not allowed in autolinks
                    '>' => {
                        // Backslash escapes are not recognized in autolink targets.
                        const target = try ip.parent.addString(ip.content[start + 1 .. ip.pos]);
                        const node = try ip.parent.addNode(.{
                            .tag = .autolink,
                            .data = .{ .text = .{
                                .content = target,
                            } },
                        });
                        try ip.completed_inlines.append(ip.parent.allocator, .{
                            .node = node,
                            .start = start,
                            .len = ip.pos - start + 1,
                        });
                        return;
                    },
                    else => {},
                },
            }
        }
        // No valid autolink found; treat the `<` as plain text.
        ip.pos = start;
    }

    /// Parses a plain text autolink (not delimited by `<>`), starting at the
    /// first character in the link (an `h`). `ip.pos` is left at the last
    /// character of the link, or remains unchanged if there is no valid link.
fn parseTextAutolink(ip: *InlineParser) !void {
        const start = ip.pos;
        // State machine matching "http", optional "s", "://", then content.
        var state: union(enum) {
            /// Inside `http`. Contains the rest of the text to be matched.
            http: []const u8,
            after_http,
            after_https,
            /// Inside `://`. Contains the rest of the text to be matched.
            authority: []const u8,
            /// Inside link content.
            content: struct {
                start: usize,
                paren_nesting: usize,
            },
        } = .{ .http = "http" };

        while (ip.pos < ip.content.len) : (ip.pos += 1) {
            switch (state) {
                .http => |rest| {
                    if (ip.content[ip.pos] != rest[0]) break;
                    if (rest.len > 1) {
                        state = .{ .http = rest[1..] };
                    } else {
                        state = .after_http;
                    }
                },
                .after_http => switch (ip.content[ip.pos]) {
                    's' => state = .after_https,
                    ':' => state = .{ .authority = "//" },
                    else => break,
                },
                .after_https => switch (ip.content[ip.pos]) {
                    ':' => state = .{ .authority = "//" },
                    else => break,
                },
                .authority => |rest| {
                    if (ip.content[ip.pos] != rest[0]) break;
                    if (rest.len > 1) {
                        state = .{ .authority = rest[1..] };
                    } else {
                        state = .{ .content = .{
                            .start = ip.pos + 1,
                            .paren_nesting = 0,
                        } };
                    }
                },
                .content => |*content| switch (ip.content[ip.pos]) {
                    // Whitespace ends the link.
                    ' ', '\t', '\n' => break,
                    '(' => content.paren_nesting += 1,
                    // An unbalanced ")" ends the link.
                    ')' => if (content.paren_nesting == 0) {
                        break;
                    } else {
                        content.paren_nesting -= 1;
                    },
                    else => {},
                },
            }
        }

        switch (state) {
            // The prefix never completed: not a link; rewind.
            .http, .after_http, .after_https, .authority => {
                ip.pos = start;
            },
            .content => |content| {
                // Trim trailing punctuation that should not be considered
                // part of the link.
                while (ip.pos > content.start and
                    isPostTextAutolink(ip.content[ip.pos - 1]))
                {
                    ip.pos -= 1;
                }
                // A link with no content after "://" is not a link.
                if (ip.pos == content.start) {
                    ip.pos = start;
                    return;
                }

                const target = try ip.parent.addString(ip.content[start..ip.pos]);
                const node = try ip.parent.addNode(.{
                    .tag = .autolink,
                    .data = .{ .text = .{
                        .content = target,
                    } },
                });
                try ip.completed_inlines.append(ip.parent.allocator, .{
                    .node = node,
                    .start = start,
                    .len = ip.pos - start,
                });
                // Leave ip.pos at the last character of the link.
                ip.pos -= 1;
            },
        }
    }

    /// Returns whether `c` may appear before a text autolink is recognized.
fn isPreTextAutolink(c: u8) bool {
    return switch (c) {
        ' ', '\t', '\n', '*', '_', '(' => true,
        else => false,
    };
}

/// Returns whether `c` is punctuation that may appear after a text autolink
/// and not be considered part of it.
fn isPostTextAutolink(c: u8) bool {
    return switch (c) {
        '?', '!', '.', ',', ':', '*', '_' => true,
        else => false,
    };
}

/// Parses emphasis, starting at the beginning of a run of `*` or `_`
/// characters. `ip.pos` is left at the last character in the run after
/// parsing.
fn parseEmphasis(ip: *InlineParser) !void {
    const char = ip.content[ip.pos];
    var start = ip.pos;
    // Consume the whole run of identical emphasis characters.
    while (ip.pos + 1 < ip.content.len and ip.content[ip.pos + 1] == char) {
        ip.pos += 1;
    }
    var len = ip.pos - start + 1;

    // Classify the characters immediately surrounding the run; the start
    // and end of the content count as whitespace/punctuation boundaries.
    const underscore = char == '_';
    const space_before = start == 0 or isWhitespace(ip.content[start - 1]);
    const space_after = start + len == ip.content.len or isWhitespace(ip.content[start + len]);
    const punct_before = start == 0 or isPunctuation(ip.content[start - 1]);
    const punct_after = start + len == ip.content.len or isPunctuation(ip.content[start + len]);
    // The rules for when emphasis may be closed or opened are stricter for
    // underscores to avoid inappropriately interpreting snake_case words as
    // containing emphasis markers.
    const can_open = if (underscore)
        !space_after and (space_before or punct_before)
    else
        !space_after;
    const can_close = if (underscore)
        !space_before and (space_after or punct_after)
    else
        !space_before;

    if (can_close and ip.pending_inlines.items.len > 0) {
        // Search the pending-inline stack top-down for matching openers,
        // consuming run characters until the run is exhausted.
        var i = ip.pending_inlines.items.len;
        while (i > 0 and len > 0) {
            i -= 1;
            const opener = &ip.pending_inlines.items[i];
            // Only an opener of the same character kind can be closed here.
            if (opener.tag != .emphasis or
                opener.data.emphasis.underscore != underscore) continue;

            // Close as many characters as both the opener and this run have
            // available.
            const close_len = @min(opener.data.emphasis.run_len, len);
            const opener_end = opener.start + opener.data.emphasis.run_len;
            // The emphasis node wraps the content strictly between the
            // consumed opener and closer characters.
            const emphasis = try ip.encodeEmphasis(opener_end, start, close_len);

            // The span covers the consumed part of the opener run, the
            // content, and the consumed part of the closer run.
            const emphasis_start = opener_end - close_len;
            const emphasis_len = start - emphasis_start + close_len;
            try ip.completed_inlines.append(ip.parent.allocator, .{
                .node = emphasis,
                .start = emphasis_start,
                .len = emphasis_len,
            });

            // There may still be other openers further down in the
            // stack to close, or part of this run might serve as an
            // opener itself.
            start += close_len;
            len -= close_len;

            // Remove any pending inlines above this on the stack, since
            // closing this emphasis will prevent them from being closed.
            // Additionally, if this opener is completely consumed by
            // being closed, it can be removed.
            opener.data.emphasis.run_len -= close_len;
            if (opener.data.emphasis.run_len == 0) {
                ip.pending_inlines.shrinkRetainingCapacity(i);
            } else {
                ip.pending_inlines.shrinkRetainingCapacity(i + 1);
            }
        }
    }

    // Any characters left over in the run may open emphasis to be closed
    // later.
    if (can_open and len > 0) {
        try ip.pending_inlines.append(ip.parent.allocator, .{
            .tag = .emphasis,
            .data = .{ .emphasis = .{
                .underscore = underscore,
                .run_len = len,
            } },
            .start = start,
        });
    }
}

/// Encodes emphasis specified by a run of `run_len` emphasis characters,
/// with `start..end` being the range of content contained within the
/// emphasis.
fn encodeEmphasis(ip: *InlineParser, start: usize, end: usize, run_len: usize) !Node.Index {
    const children = try ip.encodeChildren(start, end);
    // A run of length 1 mod 3 ends in plain emphasis, 2 mod 3 in strong,
    // and 0 mod 3 in strong nested inside emphasis; the loop below then
    // wraps additional strong-in-emphasis layers for every remaining 3.
    var inner = switch (run_len % 3) {
        1 => try ip.parent.addNode(.{
            .tag = .emphasis,
            .data = .{ .container = .{
                .children = children,
            } },
        }),
        2 => try ip.parent.addNode(.{
            .tag = .strong,
            .data = .{ .container = .{
                .children = children,
            } },
        }),
        0 => strong_emphasis: {
            const strong = try ip.parent.addNode(.{
                .tag = .strong,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
            break :strong_emphasis try ip.parent.addNode(.{
                .tag = .emphasis,
                .data = .{ .container = .{
                    .children = try ip.parent.addExtraChildren(&.{strong}),
                } },
            });
        },
        else => unreachable,
    };

    var run_left = run_len;
    while (run_left > 3) : (run_left -= 3) {
        const strong = try ip.parent.addNode(.{
            .tag = .strong,
            .data = .{ .container = .{
                .children = try ip.parent.addExtraChildren(&.{inner}),
            } },
        });
        inner = try ip.parent.addNode(.{
            .tag = .emphasis,
            .data = .{ .container = .{
                .children = try ip.parent.addExtraChildren(&.{strong}),
            } },
        });
    }

    return inner;
}

/// Parses a code span, starting at the beginning of the opening backtick
/// run. `ip.pos` is left at the last character in the closing run after
/// parsing.
fn parseCodeSpan(ip: *InlineParser) !void {
    const opener_start = ip.pos;
    // Measure the length of the opening backtick run.
    ip.pos = mem.indexOfNonePos(u8, ip.content, ip.pos, "`") orelse ip.content.len;
    const opener_len = ip.pos - opener_start;

    const start = ip.pos;
    // Scan for a closing backtick run of exactly the same length; shorter
    // or longer runs are part of the span content.
    const end = while (mem.indexOfScalarPos(u8, ip.content, ip.pos, '`')) |closer_start| {
        ip.pos = mem.indexOfNonePos(u8, ip.content, closer_start, "`") orelse ip.content.len;
        const closer_len = ip.pos - closer_start;

        if (closer_len == opener_len) break closer_start;
    } else unterminated: {
        // No matching closer: the span extends to the end of the content.
        ip.pos = ip.content.len;
        break :unterminated ip.content.len;
    };

    var content = if (start < ip.content.len) ip.content[start..end] else "";
    // This single space removal rule allows code spans to be written which
    // start or end with backticks.
    if (mem.startsWith(u8, content, " `")) content = content[1..];
    if (mem.endsWith(u8, content, "` ")) content = content[0 .. content.len - 1];

    const text = try ip.parent.addNode(.{
        .tag = .code_span,
        .data = .{ .text = .{
            .content = try ip.parent.addString(content),
        } },
    });
    try ip.completed_inlines.append(ip.parent.allocator, .{
        .node = text,
        .start = opener_start,
        .len = ip.pos - opener_start,
    });

    // Ensure ip.pos is pointing at the last character of the
    // closer, not after it.
    ip.pos -= 1;
}

/// Encodes children parsed in the content range `start..end`. The children
/// will be text nodes and any completed inlines within the range.
fn encodeChildren(ip: *InlineParser, start: usize, end: usize) !ExtraIndex {
    const scratch_extra_top = ip.parent.scratch_extra.items.len;
    defer ip.parent.scratch_extra.shrinkRetainingCapacity(scratch_extra_top);

    // Find the first completed inline that falls within the range; inlines
    // at or past `start` belong to this parent.
    var child_index = ip.completed_inlines.items.len;
    while (child_index > 0 and ip.completed_inlines.items[child_index - 1].start >= start) {
        child_index -= 1;
    }
    const start_child_index = child_index;

    // Interleave text nodes with the completed inlines, covering any gaps
    // between them.
    var pos = start;
    while (child_index < ip.completed_inlines.items.len) : (child_index += 1) {
        const child_inline = ip.completed_inlines.items[child_index];
        // Completed inlines must be strictly nested within the encodable
        // content.
        assert(child_inline.start >= pos and child_inline.start + child_inline.len <= end);

        if (child_inline.start > pos) {
            try ip.encodeTextNode(pos, child_inline.start);
        }
        try ip.parent.addScratchExtraNode(child_inline.node);
        pos = child_inline.start + child_inline.len;
    }
    // The consumed inlines are now owned by this parent; drop them from the
    // completed list.
    ip.completed_inlines.shrinkRetainingCapacity(start_child_index);

    // Trailing text after the last inline.
    if (pos < end) {
        try ip.encodeTextNode(pos, end);
    }

    const children = ip.parent.scratch_extra.items[scratch_extra_top..];
    return try ip.parent.addExtraChildren(@ptrCast(children));
}

/// Encodes textual content `ip.content[start..end]` to `scratch_extra`. The
/// encoded content may include both `text` and `line_break` nodes.
fn encodeTextNode(ip: *InlineParser, start: usize, end: usize) !void {
    // For efficiency, we can encode directly into string_bytes rather than
    // creating a temporary string and then encoding it, since this process
    // is entirely linear.
    const string_top = ip.parent.string_bytes.items.len;
    // On error, discard everything appended to string_bytes by this call.
    errdefer ip.parent.string_bytes.shrinkRetainingCapacity(string_top);

    // `string_start` tracks the beginning of the text run currently being
    // accumulated; it advances past each flushed `text` node.
    var string_start = string_top;
    var text_iter: TextIterator = .{ .content = ip.content[start..end] };
    while (text_iter.next()) |content| {
        switch (content) {
            .char => |c| try ip.parent.string_bytes.append(ip.parent.allocator, c),
            .text => |s| try ip.parent.string_bytes.appendSlice(ip.parent.allocator, s),
            .line_break => {
                // Flush any accumulated text as a `text` node before
                // emitting the `line_break` node.
                if (ip.parent.string_bytes.items.len > string_start) {
                    // Null-terminate the flushed string.
                    try ip.parent.string_bytes.append(ip.parent.allocator, 0);
                    try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{
                        .tag = .text,
                        .data = .{ .text = .{
                            .content = @enumFromInt(string_start),
                        } },
                    }));
                    string_start = ip.parent.string_bytes.items.len;
                }

                try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{
                    .tag = .line_break,
                    .data = .{ .none = {} },
                }));
            },
        }
    }

    // Flush any remaining text after the final line break (or the whole
    // content, if there were no line breaks).
    if (ip.parent.string_bytes.items.len > string_start) {
        try ip.parent.string_bytes.append(ip.parent.allocator, 0);
        try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{
            .tag = .text,
            .data = .{ .text = .{
                .content = @enumFromInt(string_start),
            } },
        }));
    }
}

/// An iterator over parts of textual content, handling unescaping of
/// escaped characters and line breaks.
const TextIterator = struct {
    content: []const u8,
    pos: usize = 0,

    /// One decoded piece of content: a single byte, a multi-byte slice, or
    /// a hard line break.
    const Content = union(enum) {
        char: u8,
        text: []const u8,
        line_break,
    };

    // U+FFFD REPLACEMENT CHARACTER, substituted for NUL bytes and invalid
    // UTF-8 sequences.
    const replacement = "\u{FFFD}";

    /// Returns the next piece of content, or null at the end. Backslash
    /// escapes are resolved here: `\` before a newline yields a line break,
    /// `\` before ASCII punctuation yields the punctuation character
    /// itself, and any other `\` is literal.
    fn next(iter: *TextIterator) ?Content {
        if (iter.pos >= iter.content.len) return null;
        if (iter.content[iter.pos] == '\\') {
            iter.pos += 1;
            if (iter.pos == iter.content.len) {
                // A trailing backslash is literal.
                return .{ .char = '\\' };
            } else if (iter.content[iter.pos] == '\n') {
                iter.pos += 1;
                return .line_break;
            } else if (isPunctuation(iter.content[iter.pos])) {
                const c = iter.content[iter.pos];
                iter.pos += 1;
                return .{ .char = c };
            } else {
                // Backslash before a non-punctuation character is literal;
                // the following character is handled on the next call.
                return .{ .char = '\\' };
            }
        }
        return iter.nextCodepoint();
    }

    /// Returns the next codepoint, replacing NUL bytes and invalid UTF-8
    /// with U+FFFD.
    fn nextCodepoint(iter: *TextIterator) ?Content {
        switch (iter.content[iter.pos]) {
            0 => {
                iter.pos += 1;
                return .{ .text = replacement };
            },
            1...127 => |c| {
                // Plain ASCII: return the byte directly.
                iter.pos += 1;
                return .{ .char = c };
            },
            else => |b| {
                const cp_len = std.unicode.utf8ByteSequenceLength(b) catch {
                    // Invalid leading byte: replace it and move on by one.
                    iter.pos += 1;
                    return .{ .text = replacement };
                };
                const is_valid = iter.pos + cp_len <= iter.content.len and
                    std.unicode.utf8ValidateSlice(iter.content[iter.pos..][0..cp_len]);
                const cp_encoded = if (is_valid) iter.content[iter.pos..][0..cp_len] else replacement;
                // Advance past the whole (claimed) sequence even if it was
                // replaced.
                iter.pos += cp_len;
                return .{ .text = cp_encoded };
            },
        }
    }
};
};

/// Parses `content` as inline Markdown (after trimming surrounding
/// whitespace) and returns the index of the resulting children.
fn parseInlines(p: *Parser, content: []const u8) !ExtraIndex {
    var ip: InlineParser = .{
        .parent = p,
        .content = mem.trim(u8, content, " \t\n"),
    };
    defer ip.deinit();
    return try ip.parse();
}

/// Decodes a `T` stored in `extra` starting at `index`. Every field of `T`
/// must be a `u32`; `end` is the index just past the decoded data.
pub fn extraData(p: Parser, comptime T: type, index: ExtraIndex) ExtraData(T) {
    const fields = @typeInfo(T).@"struct".fields;
    var i: usize = @intFromEnum(index);
    var result: T = undefined;
    inline for (fields) |field| {
        @field(result, field.name) = switch (field.type) {
            u32 => p.extra.items[i],
            else => @compileError("bad field type"),
        };
        i += 1;
    }
    return .{ .data = result, .end = i };
}

/// Returns the child node indices stored in `extra` at `index`: a
/// `Node.Children` header followed by `len` node indices.
pub fn extraChildren(p: Parser, index: ExtraIndex) []const Node.Index {
    const children = p.extraData(Node.Children, index);
    return @ptrCast(p.extra.items[children.end..][0..children.data.len]);
}

/// Appends `node` to the node list and returns its index.
fn addNode(p: *Parser, node: Node) !Node.Index {
    const index: Node.Index = @enumFromInt(@as(u32, @intCast(p.nodes.len)));
    try p.nodes.append(p.allocator, node);
    return index;
}

/// Stores `s` null-terminated in `string_bytes` and returns its index.
/// The empty string maps to the shared `.empty` index without allocating.
fn addString(p: *Parser, s: []const u8) !StringIndex {
    if (s.len == 0) return .empty;

    const index: StringIndex = @enumFromInt(@as(u32, @intCast(p.string_bytes.items.len)));
    try p.string_bytes.ensureUnusedCapacity(p.allocator, s.len + 1);
    p.string_bytes.appendSliceAssumeCapacity(s);
    p.string_bytes.appendAssumeCapacity(0);
    return index;
}

/// Stores `nodes` in `extra` as a length followed by the node indices, and
/// returns the index of the stored data.
fn addExtraChildren(p: *Parser, nodes: []const Node.Index) !ExtraIndex {
    const index: ExtraIndex = @enumFromInt(@as(u32, @intCast(p.extra.items.len)));
    try p.extra.ensureUnusedCapacity(p.allocator, nodes.len + 1);
    p.extra.appendAssumeCapacity(@intCast(nodes.len));
    p.extra.appendSliceAssumeCapacity(@ptrCast(nodes));
    return index;
}

/// Appends a node index to the scratch extra buffer.
fn addScratchExtraNode(p: *Parser, node: Node.Index) !void {
    try p.scratch_extra.append(p.allocator, @intFromEnum(node));
}

/// Appends `line` plus a trailing newline to the scratch string buffer.
fn addScratchStringLine(p: *Parser, line: []const u8) !void {
    try p.scratch_string.ensureUnusedCapacity(p.allocator, line.len + 1);
    p.scratch_string.appendSliceAssumeCapacity(line);
    p.scratch_string.appendAssumeCapacity('\n');
}

/// Returns whether `line` contains only spaces and tabs.
fn isBlank(line: []const u8) bool {
    return mem.indexOfNone(u8, line, " \t") == null;
}

/// Returns whether `c` is an ASCII punctuation character.
fn isPunctuation(c: u8) bool {
    return switch (c) {
        '!',
        '"',
        '#',
        '$',
        '%',
        '&',
        '\'',
        '(',
        ')',
        '*',
        '+',
        ',',
        '-',
        '.',
        '/',
        ':',
        ';',
        '<',
        '=',
        '>',
        '?',
        '@',
        '[',
        '\\',
        ']',
        '^',
        '_',
        '`',
        '{',
        '|',
        '}',
        '~',
        => true,
        else => false,
    };
}