Merge pull request #14434 from FnControlOption/xz

Add xz decoder

closes #14300
closes #2851
This commit is contained in:
Andrew Kelley 2023-01-25 20:31:55 -05:00 committed by GitHub
commit 96a55f6ce8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
32 changed files with 1230 additions and 16 deletions

View file

@ -122,6 +122,8 @@ pub fn build(b: *Builder) !void {
"compress-gettysburg.txt",
"compress-pi.txt",
"rfc1951.txt",
// exclude files from lib/std/compress/xz/testdata
".xz",
// exclude files from lib/std/tz/
".tzif",
// others

View file

@ -3,6 +3,7 @@ const std = @import("std.zig");
pub const deflate = @import("compress/deflate.zig");
pub const gzip = @import("compress/gzip.zig");
pub const zlib = @import("compress/zlib.zig");
pub const xz = @import("compress/xz.zig");
pub fn HashedReader(
comptime ReaderType: anytype,
@ -38,4 +39,5 @@ test {
_ = deflate;
_ = gzip;
_ = zlib;
_ = xz;
}

View file

@ -1,7 +1,7 @@
//
// Decompressor for GZIP data streams (RFC1952)
const std = @import("std");
const std = @import("../std.zig");
const io = std.io;
const fs = std.fs;
const testing = std.testing;
@ -17,10 +17,7 @@ const FCOMMENT = 1 << 4;
const max_string_len = 1024;
/// TODO: the fully qualified namespace to this declaration is
/// std.compress.gzip.GzipStream which has a redundant "gzip" in the name.
/// Instead, it should be `std.compress.gzip.Stream`.
pub fn GzipStream(comptime ReaderType: type) type {
pub fn Decompress(comptime ReaderType: type) type {
return struct {
const Self = @This();
@ -154,14 +151,14 @@ pub fn GzipStream(comptime ReaderType: type) type {
};
}
pub fn gzipStream(allocator: mem.Allocator, reader: anytype) !GzipStream(@TypeOf(reader)) {
return GzipStream(@TypeOf(reader)).init(allocator, reader);
pub fn decompress(allocator: mem.Allocator, reader: anytype) !Decompress(@TypeOf(reader)) {
return Decompress(@TypeOf(reader)).init(allocator, reader);
}
fn testReader(data: []const u8, comptime expected: []const u8) !void {
var in_stream = io.fixedBufferStream(data);
var gzip_stream = try gzipStream(testing.allocator, in_stream.reader());
var gzip_stream = try decompress(testing.allocator, in_stream.reader());
defer gzip_stream.deinit();
// Read and decompress the whole file

145
lib/std/compress/xz.zig Normal file
View file

@ -0,0 +1,145 @@
const std = @import("std");
const block = @import("xz/block.zig");
const Allocator = std.mem.Allocator;
const Crc32 = std.hash.Crc32;
/// Integrity-check type declared in the xz stream flags; values are the
/// raw 4-bit check IDs. Only the four named checks are handled downstream;
/// other IDs are representable (non-exhaustive enum) but unsupported.
pub const Check = enum(u4) {
    none = 0x00,
    crc32 = 0x01,
    crc64 = 0x04,
    sha256 = 0x0A,
    _,
};
/// Parses the two-byte xz stream-flags field and stores the declared
/// integrity-check type into `check`. Both reserved areas (the entire
/// first byte and the high nibble of the second) must be zero, otherwise
/// `error.CorruptInput` is returned.
fn readStreamFlags(reader: anytype, check: *Check) !void {
    var bits = std.io.bitReader(.Little, reader);

    if (try bits.readBitsNoEof(u8, 8) != 0) {
        return error.CorruptInput;
    }

    const check_id = try bits.readBitsNoEof(u4, 4);
    check.* = @intToEnum(Check, check_id);

    if (try bits.readBitsNoEof(u4, 4) != 0) {
        return error.CorruptInput;
    }
}
/// Creates an xz `Decompress` over `reader`. `allocator` backs the
/// decoder's internal buffers; fails if the stream header is invalid.
pub fn decompress(allocator: Allocator, reader: anytype) !Decompress(@TypeOf(reader)) {
    const Stream = Decompress(@TypeOf(reader));
    return Stream.init(allocator, reader);
}
/// Streaming xz decompressor over an arbitrary source reader.
/// `init` validates the 12-byte stream header; `read` drains decoded
/// block data and, once the blocks are exhausted, validates the stream
/// index and footer before reporting end-of-stream.
pub fn Decompress(comptime ReaderType: type) type {
    return struct {
        const Self = @This();

        pub const Error = ReaderType.Error || block.Decoder(ReaderType).Error;
        pub const Reader = std.io.Reader(*Self, Error, read);

        allocator: Allocator,
        block_decoder: block.Decoder(ReaderType),
        in_reader: ReaderType,

        /// Reads and validates the stream header: 6-byte magic, stream
        /// flags, and the stored CRC32 of the flags.
        fn init(allocator: Allocator, source: ReaderType) !Self {
            const magic = try source.readBytesNoEof(6);
            if (!std.mem.eql(u8, &magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 }))
                return error.BadHeader;

            var check: Check = undefined;
            // Hash the flag bytes while parsing them so the stored CRC32
            // can be verified without buffering or re-reading.
            const hash_a = blk: {
                var hasher = std.compress.hashedReader(source, Crc32.init());
                try readStreamFlags(hasher.reader(), &check);
                break :blk hasher.hasher.final();
            };

            const hash_b = try source.readIntLittle(u32);
            if (hash_a != hash_b)
                return error.WrongChecksum;

            return Self{
                .allocator = allocator,
                .block_decoder = try block.decoder(allocator, source, check),
                .in_reader = source,
            };
        }

        pub fn deinit(self: *Self) void {
            self.block_decoder.deinit();
        }

        pub fn reader(self: *Self) Reader {
            return .{ .context = self };
        }

        /// Returns decoded bytes from the block decoder. When the block
        /// decoder reports 0 (end of blocks), parses and verifies the
        /// CRC32-protected index and the stream footer, then returns 0.
        pub fn read(self: *Self, buffer: []u8) Error!usize {
            if (buffer.len == 0)
                return 0;

            const r = try self.block_decoder.read(buffer);
            if (r != 0)
                return r;

            // Index: list of (unpadded size, uncompressed size) records,
            // one per block, zero-padded to 4 bytes, CRC32 at the end.
            const index_size = blk: {
                var hasher = std.compress.hashedReader(self.in_reader, Crc32.init());
                // The 0x00 index-indicator byte was already consumed by the
                // block decoder; fold it into the hash and the byte count.
                hasher.hasher.update(&[1]u8{0x00});

                var counter = std.io.countingReader(hasher.reader());
                counter.bytes_read += 1;

                const counting_reader = counter.reader();

                const record_count = try std.leb.readULEB128(u64, counting_reader);
                if (record_count != self.block_decoder.block_count)
                    return error.CorruptInput;

                var i: usize = 0;
                while (i < record_count) : (i += 1) {
                    // TODO: validate records
                    _ = try std.leb.readULEB128(u64, counting_reader);
                    _ = try std.leb.readULEB128(u64, counting_reader);
                }

                // Index padding must be zero up to a 4-byte boundary.
                while (counter.bytes_read % 4 != 0) {
                    if (try counting_reader.readByte() != 0)
                        return error.CorruptInput;
                }

                const hash_a = hasher.hasher.final();
                const hash_b = try counting_reader.readIntLittle(u32);
                if (hash_a != hash_b)
                    return error.WrongChecksum;

                break :blk counter.bytes_read;
            };

            // Stream footer: CRC32 of {backward size, stream flags},
            // then the 2-byte footer magic "YZ".
            const hash_a = try self.in_reader.readIntLittle(u32);

            const hash_b = blk: {
                var hasher = std.compress.hashedReader(self.in_reader, Crc32.init());
                const hashed_reader = hasher.reader();

                // NOTE(review): u32 arithmetic — `stored + 1` wraps for a
                // pathological 0xFFFFFFFF backward-size field; confirm the
                // overflow panic (safe modes) is the intended behavior.
                const backward_size = (try hashed_reader.readIntLittle(u32) + 1) * 4;
                if (backward_size != index_size)
                    return error.CorruptInput;

                var check: Check = undefined;
                try readStreamFlags(hashed_reader, &check);

                break :blk hasher.hasher.final();
            };

            if (hash_a != hash_b)
                return error.WrongChecksum;

            const magic = try self.in_reader.readBytesNoEof(2);
            if (!std.mem.eql(u8, &magic, &.{ 'Y', 'Z' }))
                return error.CorruptInput;

            return 0;
        }
    };
}
test {
    // Reference the fixture-driven round-trip tests so `zig test` runs them.
    _ = @import("xz/test.zig");
}

View file

@ -0,0 +1,317 @@
const std = @import("../../std.zig");
const lzma = @import("lzma.zig");
const Allocator = std.mem.Allocator;
const Crc32 = std.hash.Crc32;
const Crc64 = std.hash.crc.Crc64Xz;
const Sha256 = std.crypto.hash.sha2.Sha256;
const xz = std.compress.xz;
/// Errors produced while decoding xz block data, merged with the source
/// reader's and allocator's errors in `Decoder(...).Error`.
const DecodeError = error{
    CorruptInput,
    EndOfStream,
    // Internal sentinel: the stream ended cleanly. The decoder translates
    // it into a normal 0-byte read; it is never surfaced to callers.
    EndOfStreamWithNoError,
    WrongChecksum,
    Unsupported,
    Overflow,
};
/// Convenience constructor: builds a block `Decoder` for the concrete
/// type of `reader`, using `check` as the per-block integrity check.
pub fn decoder(allocator: Allocator, reader: anytype, check: xz.Check) !Decoder(@TypeOf(reader)) {
    const D = Decoder(@TypeOf(reader));
    return D.init(allocator, reader, check);
}
/// Per-block xz decoder: parses block headers, drives the LZMA2 chunk
/// decoder, and verifies block padding plus the configured integrity
/// check (none/CRC32/CRC64/SHA-256) over each block's decoded bytes.
pub fn Decoder(comptime ReaderType: type) type {
    return struct {
        const Self = @This();

        pub const Error =
            ReaderType.Error ||
            DecodeError ||
            Allocator.Error;
        pub const Reader = std.io.Reader(*Self, Error, read);

        allocator: Allocator,
        inner_reader: ReaderType,
        check: xz.Check,
        // Sticky error: once set, it is returned after the already-decoded
        // output in `accum.to_read` has been fully drained.
        err: ?Error,
        // Decoded-output accumulator; `to_read` is what read() hands out.
        accum: lzma.LzAccumBuffer,
        lzma_state: lzma.DecoderState,
        // Number of completed blocks; validated against the stream index.
        block_count: usize,

        fn init(allocator: Allocator, in_reader: ReaderType, check: xz.Check) !Self {
            return Self{
                .allocator = allocator,
                .inner_reader = in_reader,
                .check = check,
                .err = null,
                .accum = .{},
                .lzma_state = try lzma.DecoderState.init(allocator),
                .block_count = 0,
            };
        }

        pub fn deinit(self: *Self) void {
            self.accum.deinit(self.allocator);
            self.lzma_state.deinit(self.allocator);
        }

        pub fn reader(self: *Self) Reader {
            return .{ .context = self };
        }

        /// Serves buffered output first; otherwise decodes the next block.
        /// `EndOfStreamWithNoError` is translated into a 0-byte read.
        pub fn read(self: *Self, output: []u8) Error!usize {
            while (true) {
                if (self.accum.to_read.items.len > 0) {
                    const n = self.accum.read(output);
                    if (self.accum.to_read.items.len == 0 and self.err != null) {
                        if (self.err.? == DecodeError.EndOfStreamWithNoError) {
                            return n;
                        }
                        return self.err.?;
                    }
                    return n;
                }
                if (self.err != null) {
                    if (self.err.? == DecodeError.EndOfStreamWithNoError) {
                        return 0;
                    }
                    return self.err.?;
                }
                self.readBlock() catch |e| {
                    // Remember the failure, but flush anything decoded so
                    // far so the next loop iteration can hand it out first.
                    self.err = e;
                    if (self.accum.to_read.items.len == 0) {
                        try self.accum.reset(self.allocator);
                    }
                };
            }
        }

        /// Decodes one complete block: header, LZMA2 chunk sequence,
        /// zero padding to a 4-byte boundary, then the integrity check.
        fn readBlock(self: *Self) Error!void {
            const unpacked_pos = self.accum.to_read.items.len;

            var block_counter = std.io.countingReader(self.inner_reader);
            const block_reader = block_counter.reader();

            // Optional sizes declared in the block header (null = absent).
            var packed_size: ?u64 = null;
            var unpacked_size: ?u64 = null;

            // Block Header
            {
                var header_hasher = std.compress.hashedReader(block_reader, Crc32.init());
                const header_reader = header_hasher.reader();

                // A 0x00 size byte here is actually the index indicator,
                // i.e. there are no more blocks.
                // NOTE(review): u8 arithmetic — encoded header sizes above
                // 63 (spec allows up to 0xFF) overflow/panic here; confirm.
                const header_size = try header_reader.readByte() * 4;
                if (header_size == 0)
                    return error.EndOfStreamWithNoError;

                const Flags = packed struct(u8) {
                    last_filter_index: u2,
                    reserved: u4,
                    has_packed_size: bool,
                    has_unpacked_size: bool,
                };

                const flags = @bitCast(Flags, try header_reader.readByte());
                const filter_count = @as(u3, flags.last_filter_index) + 1;
                // Filter chains (delta, BCJ, ...) are not implemented.
                if (filter_count > 1)
                    return error.Unsupported;

                if (flags.has_packed_size)
                    packed_size = try std.leb.readULEB128(u64, header_reader);

                if (flags.has_unpacked_size)
                    unpacked_size = try std.leb.readULEB128(u64, header_reader);

                const FilterId = enum(u64) {
                    lzma2 = 0x21,
                    _,
                };

                const filter_id = @intToEnum(
                    FilterId,
                    try std.leb.readULEB128(u64, header_reader),
                );

                // Filter IDs >= 2^62 are reserved as invalid.
                if (@enumToInt(filter_id) >= 0x4000_0000_0000_0000)
                    return error.CorruptInput;

                if (filter_id != .lzma2)
                    return error.Unsupported;

                // LZMA2's only property is the 1-byte dictionary size.
                const properties_size = try std.leb.readULEB128(u64, header_reader);
                if (properties_size != 1)
                    return error.CorruptInput;

                // TODO: use filter properties
                _ = try header_reader.readByte();

                // Header padding must be zero-filled up to header_size.
                while (block_counter.bytes_read != header_size) {
                    if (try header_reader.readByte() != 0)
                        return error.CorruptInput;
                }

                const hash_a = header_hasher.hasher.final();
                const hash_b = try header_reader.readIntLittle(u32);
                if (hash_a != hash_b)
                    return error.WrongChecksum;
            }

            // Compressed Data: LZMA2 chunks until a 0x00 end marker.
            var packed_counter = std.io.countingReader(block_reader);
            const packed_reader = packed_counter.reader();
            while (try self.readLzma2Chunk(packed_reader)) {}

            if (packed_size) |s| {
                if (s != packed_counter.bytes_read)
                    return error.CorruptInput;
            }

            const unpacked_bytes = self.accum.to_read.items[unpacked_pos..];
            if (unpacked_size) |s| {
                if (s != unpacked_bytes.len)
                    return error.CorruptInput;
            }

            // Block Padding
            while (block_counter.bytes_read % 4 != 0) {
                if (try block_reader.readByte() != 0)
                    return error.CorruptInput;
            }

            // Verify this block's decoded bytes against the stream's
            // declared check. The check field follows the padding and is
            // deliberately read from inner_reader (it is not counted).
            switch (self.check) {
                .none => {},
                .crc32 => {
                    const hash_a = Crc32.hash(unpacked_bytes);
                    const hash_b = try self.inner_reader.readIntLittle(u32);
                    if (hash_a != hash_b)
                        return error.WrongChecksum;
                },
                .crc64 => {
                    const hash_a = Crc64.hash(unpacked_bytes);
                    const hash_b = try self.inner_reader.readIntLittle(u64);
                    if (hash_a != hash_b)
                        return error.WrongChecksum;
                },
                .sha256 => {
                    var hash_a: [Sha256.digest_length]u8 = undefined;
                    Sha256.hash(unpacked_bytes, &hash_a, .{});

                    var hash_b: [Sha256.digest_length]u8 = undefined;
                    try self.inner_reader.readNoEof(&hash_b);

                    if (!std.mem.eql(u8, &hash_a, &hash_b))
                        return error.WrongChecksum;
                },
                else => return error.Unsupported,
            }

            self.block_count += 1;
        }

        /// Decodes one LZMA2 chunk. Returns false when the end-of-block
        /// control byte (0x00) is reached.
        fn readLzma2Chunk(self: *Self, packed_reader: anytype) Error!bool {
            const status = try packed_reader.readByte();
            switch (status) {
                0 => {
                    try self.accum.reset(self.allocator);
                    return false;
                },
                // 1/2: uncompressed chunk; 1 additionally resets the dict.
                1, 2 => {
                    if (status == 1)
                        try self.accum.reset(self.allocator);

                    // NOTE(review): u16 arithmetic — a stored size of
                    // 0xFFFF wraps on +1 (contrast the u17 used below for
                    // packed_size); confirm intended.
                    const size = try packed_reader.readIntBig(u16) + 1;
                    try self.accum.ensureUnusedCapacity(self.allocator, size);

                    var i: usize = 0;
                    while (i < size) : (i += 1)
                        self.accum.appendAssumeCapacity(try packed_reader.readByte());

                    return true;
                },
                else => {
                    // LZMA chunk: high bit set; bits 5-6 pick a reset level.
                    if (status & 0x80 == 0)
                        return error.CorruptInput;

                    const Reset = struct {
                        dict: bool,
                        state: bool,
                        props: bool,
                    };

                    const reset = switch ((status >> 5) & 0x3) {
                        0 => Reset{
                            .dict = false,
                            .state = false,
                            .props = false,
                        },
                        1 => Reset{
                            .dict = false,
                            .state = true,
                            .props = false,
                        },
                        2 => Reset{
                            .dict = false,
                            .state = true,
                            .props = true,
                        },
                        3 => Reset{
                            .dict = true,
                            .state = true,
                            .props = true,
                        },
                        else => unreachable,
                    };

                    // Unpacked size: low 5 bits of the control byte as the
                    // high bits, then 16 more bits, plus 1.
                    const unpacked_size = blk: {
                        var tmp: u64 = status & 0x1F;
                        tmp <<= 16;
                        tmp |= try packed_reader.readIntBig(u16);
                        break :blk tmp + 1;
                    };

                    // u17 so that 0xFFFF + 1 cannot overflow.
                    const packed_size = blk: {
                        const tmp: u17 = try packed_reader.readIntBig(u16);
                        break :blk tmp + 1;
                    };

                    if (reset.dict)
                        try self.accum.reset(self.allocator);

                    if (reset.state) {
                        var new_props = self.lzma_state.lzma_props;

                        if (reset.props) {
                            // Properties byte encodes (pb * 5 + lp) * 9 + lc.
                            var props = try packed_reader.readByte();
                            if (props >= 225)
                                return error.CorruptInput;

                            const lc = @intCast(u4, props % 9);
                            props /= 9;
                            const lp = @intCast(u3, props % 5);
                            props /= 5;
                            const pb = @intCast(u3, props);

                            // LZMA2 restricts lc + lp to at most 4.
                            if (lc + lp > 4)
                                return error.CorruptInput;

                            new_props = .{ .lc = lc, .lp = lp, .pb = pb };
                        }

                        try self.lzma_state.reset_state(self.allocator, new_props);
                    }

                    self.lzma_state.unpacked_size = unpacked_size + self.accum.len();

                    // Buffer the whole compressed chunk so the range
                    // decoder can work over a fixed slice.
                    const buffer = try self.allocator.alloc(u8, packed_size);
                    defer self.allocator.free(buffer);
                    for (buffer) |*b|
                        b.* = try packed_reader.readByte();

                    var rangecoder = try lzma.RangeDecoder.init(buffer);
                    try self.lzma_state.process(self.allocator, &self.accum, &rangecoder);

                    return true;
                },
            }
        }
    };
}

View file

@ -0,0 +1,658 @@
// Ported from https://github.com/gendx/lzma-rs
const std = @import("../../std.zig");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ArrayListUnmanaged = std.ArrayListUnmanaged;
/// LZMA context parameters decoded from the LZMA2 properties byte.
const LzmaProperties = struct {
    // number of literal context bits
    lc: u4,
    // number of literal position bits
    lp: u3,
    // number of position bits
    pb: u3,

    /// Asserts the parameters are within the ranges this decoder supports.
    fn validate(self: LzmaProperties) void {
        assert(self.lc <= 8);
        assert(self.lp <= 4);
        assert(self.pb <= 4);
    }
};
/// Core LZMA decoder state: adaptive probability models, the last four
/// match distances, and the 12-state machine that drives literal/match
/// decisions. Probabilities are 11-bit values; 0x400 represents 0.5.
pub const DecoderState = struct {
    lzma_props: LzmaProperties,
    // When set, decoding stops after exactly this many output bytes.
    unpacked_size: ?u64,
    // Literal probabilities: one 0x300-entry row per literal context
    // (2^(lc+lp) rows after reset_state).
    literal_probs: Vec2D(u16),
    // Position-slot trees, one per length bucket (len_state 0..3).
    pos_slot_decoder: [4]BitTree,
    align_decoder: BitTree,
    pos_decoders: [115]u16,
    is_match: [192]u16,
    is_rep: [12]u16,
    is_rep_g0: [12]u16,
    is_rep_g1: [12]u16,
    is_rep_g2: [12]u16,
    is_rep_0long: [192]u16,
    state: usize,
    // Last four match distances, for repeated-match encoding.
    rep: [4]usize,
    len_decoder: LenDecoder,
    rep_len_decoder: LenDecoder,

    /// All probabilities start at 0x400 (0.5); lc/lp/pb start at zero
    /// until the first LZMA2 properties reset.
    pub fn init(allocator: Allocator) !DecoderState {
        return .{
            .lzma_props = LzmaProperties{ .lc = 0, .lp = 0, .pb = 0 },
            .unpacked_size = null,
            .literal_probs = try Vec2D(u16).init(allocator, 0x400, 1, 0x300),
            .pos_slot_decoder = .{
                try BitTree.init(allocator, 6),
                try BitTree.init(allocator, 6),
                try BitTree.init(allocator, 6),
                try BitTree.init(allocator, 6),
            },
            .align_decoder = try BitTree.init(allocator, 4),
            .pos_decoders = .{0x400} ** 115,
            .is_match = .{0x400} ** 192,
            .is_rep = .{0x400} ** 12,
            .is_rep_g0 = .{0x400} ** 12,
            .is_rep_g1 = .{0x400} ** 12,
            .is_rep_g2 = .{0x400} ** 12,
            .is_rep_0long = .{0x400} ** 192,
            .state = 0,
            .rep = .{0} ** 4,
            .len_decoder = try LenDecoder.init(allocator),
            .rep_len_decoder = try LenDecoder.init(allocator),
        };
    }

    pub fn deinit(self: *DecoderState, allocator: Allocator) void {
        self.literal_probs.deinit(allocator);
        for (self.pos_slot_decoder) |*t| t.deinit(allocator);
        self.align_decoder.deinit(allocator);
        self.len_decoder.deinit(allocator);
        self.rep_len_decoder.deinit(allocator);
    }

    /// Resets every probability model for new properties. The literal
    /// table is reallocated only when its row count (2^(lc+lp)) changes;
    /// otherwise it is refilled in place.
    pub fn reset_state(self: *DecoderState, allocator: Allocator, new_props: LzmaProperties) !void {
        new_props.validate();
        if (self.lzma_props.lc + self.lzma_props.lp == new_props.lc + new_props.lp) {
            self.literal_probs.fill(0x400);
        } else {
            self.literal_probs.deinit(allocator);
            self.literal_probs = try Vec2D(u16).init(allocator, 0x400, @as(usize, 1) << (new_props.lc + new_props.lp), 0x300);
        }

        self.lzma_props = new_props;
        for (self.pos_slot_decoder) |*t| t.reset();
        self.align_decoder.reset();
        self.pos_decoders = .{0x400} ** 115;
        self.is_match = .{0x400} ** 192;
        self.is_rep = .{0x400} ** 12;
        self.is_rep_g0 = .{0x400} ** 12;
        self.is_rep_g1 = .{0x400} ** 12;
        self.is_rep_g2 = .{0x400} ** 12;
        self.is_rep_0long = .{0x400} ** 192;
        self.state = 0;
        self.rep = .{0} ** 4;
        self.len_decoder.reset();
        self.rep_len_decoder.reset();
    }

    /// Decodes one symbol (literal, match, or repeated match). With
    /// `update == false` the probability models and state are left
    /// untouched (bits are consumed but nothing is emitted).
    fn processNextInner(
        self: *DecoderState,
        allocator: Allocator,
        output: *LzAccumBuffer,
        rangecoder: *RangeDecoder,
        update: bool,
    ) !ProcessingStatus {
        const pos_state = output.len() & ((@as(usize, 1) << self.lzma_props.pb) - 1);

        // is_match == 0: literal byte.
        if (!try rangecoder.decodeBit(
            &self.is_match[(self.state << 4) + pos_state],
            update,
        )) {
            const byte: u8 = try self.decodeLiteral(output, rangecoder, update);

            if (update) {
                try output.appendLiteral(allocator, byte);

                // Standard LZMA state transition after a literal.
                self.state = if (self.state < 4)
                    0
                else if (self.state < 10)
                    self.state - 3
                else
                    self.state - 6;
            }
            return .continue_;
        }

        var len: usize = undefined;
        // is_rep == 1: match reuses one of the last four distances.
        if (try rangecoder.decodeBit(&self.is_rep[self.state], update)) {
            if (!try rangecoder.decodeBit(&self.is_rep_g0[self.state], update)) {
                // rep0 reuse; a zero is_rep_0long bit means "short rep":
                // copy a single byte at distance rep[0] + 1.
                if (!try rangecoder.decodeBit(
                    &self.is_rep_0long[(self.state << 4) + pos_state],
                    update,
                )) {
                    if (update) {
                        self.state = if (self.state < 7) 9 else 11;
                        const dist = self.rep[0] + 1;
                        try output.appendLz(allocator, 1, dist);
                    }
                    return .continue_;
                }
            } else {
                // Select rep1/rep2/rep3 and rotate it to the front.
                const idx: usize = if (!try rangecoder.decodeBit(&self.is_rep_g1[self.state], update))
                    1
                else if (!try rangecoder.decodeBit(&self.is_rep_g2[self.state], update))
                    2
                else
                    3;

                if (update) {
                    const dist = self.rep[idx];
                    var i = idx;
                    while (i > 0) : (i -= 1) {
                        self.rep[i] = self.rep[i - 1];
                    }
                    self.rep[0] = dist;
                }
            }

            len = try self.rep_len_decoder.decode(rangecoder, pos_state, update);

            if (update) {
                self.state = if (self.state < 7) 8 else 11;
            }
        } else {
            // New match: shift the distance history and decode a fresh
            // length and distance.
            if (update) {
                self.rep[3] = self.rep[2];
                self.rep[2] = self.rep[1];
                self.rep[1] = self.rep[0];
            }

            len = try self.len_decoder.decode(rangecoder, pos_state, update);

            if (update) {
                self.state = if (self.state < 7) 7 else 10;
            }

            const rep_0 = try self.decodeDistance(rangecoder, len, update);

            if (update) {
                self.rep[0] = rep_0;

                // Distance 0xFFFFFFFF is the end-of-stream marker.
                if (self.rep[0] == 0xFFFF_FFFF) {
                    if (rangecoder.isFinished()) {
                        return .finished;
                    }
                    return error.CorruptInput;
                }
            }
        }

        if (update) {
            // Encoded lengths are biased by the 2-byte minimum match.
            len += 2;

            const dist = self.rep[0] + 1;
            try output.appendLz(allocator, len, dist);
        }

        return .continue_;
    }

    /// Decodes one symbol with model updates enabled.
    fn processNext(
        self: *DecoderState,
        allocator: Allocator,
        output: *LzAccumBuffer,
        rangecoder: *RangeDecoder,
    ) !ProcessingStatus {
        return self.processNextInner(allocator, output, rangecoder, true);
    }

    /// Decodes symbols until `unpacked_size` bytes have been produced,
    /// the range coder is exhausted, or the end marker is seen; then
    /// verifies the produced length against `unpacked_size` if set.
    pub fn process(
        self: *DecoderState,
        allocator: Allocator,
        output: *LzAccumBuffer,
        rangecoder: *RangeDecoder,
    ) !void {
        while (true) {
            if (self.unpacked_size) |unpacked_size| {
                if (output.len() >= unpacked_size) {
                    break;
                }
            } else if (rangecoder.isFinished()) {
                break;
            }

            if (try self.processNext(allocator, output, rangecoder) == .finished) {
                break;
            }
        }

        if (self.unpacked_size) |len| {
            if (len != output.len()) {
                return error.CorruptInput;
            }
        }
    }

    /// Decodes one literal byte using the context selected by position
    /// (lp bits) and previous byte (lc bits). In match-history states
    /// (state >= 7), bits are first decoded against the byte at the
    /// last match distance until a mismatch occurs.
    fn decodeLiteral(
        self: *DecoderState,
        output: *LzAccumBuffer,
        rangecoder: *RangeDecoder,
        update: bool,
    ) !u8 {
        const def_prev_byte = 0;
        const prev_byte = @as(usize, output.lastOr(def_prev_byte));

        // `result` works as a 1-rooted bit accumulator; done at >= 0x100.
        var result: usize = 1;
        const lit_state = ((output.len() & ((@as(usize, 1) << self.lzma_props.lp) - 1)) << self.lzma_props.lc) +
            (prev_byte >> (8 - self.lzma_props.lc));
        const probs = try self.literal_probs.get(lit_state);

        if (self.state >= 7) {
            var match_byte = @as(usize, try output.lastN(self.rep[0] + 1));

            while (result < 0x100) {
                const match_bit = (match_byte >> 7) & 1;
                match_byte <<= 1;
                const bit = @boolToInt(try rangecoder.decodeBit(
                    &probs[((@as(usize, 1) + match_bit) << 8) + result],
                    update,
                ));
                result = (result << 1) ^ bit;
                if (match_bit != bit) {
                    break;
                }
            }
        }

        while (result < 0x100) {
            result = (result << 1) ^ @boolToInt(try rangecoder.decodeBit(&probs[result], update));
        }

        return @truncate(u8, result - 0x100);
    }

    /// Decodes a match distance (minus 1) from a position slot plus
    /// either reverse-bit-tree bits (slots 4..13) or direct bits and
    /// align bits (slots >= 14).
    fn decodeDistance(
        self: *DecoderState,
        rangecoder: *RangeDecoder,
        length: usize,
        update: bool,
    ) !usize {
        const len_state = if (length > 3) 3 else length;

        const pos_slot = @as(usize, try self.pos_slot_decoder[len_state].parse(rangecoder, update));
        if (pos_slot < 4)
            return pos_slot;

        const num_direct_bits = @intCast(u5, (pos_slot >> 1) - 1);
        // `^` is XOR in Zig, not exponentiation; since bit 0 of 2 is
        // clear this is equivalent to (2 | (pos_slot & 1)).
        var result = (2 ^ (pos_slot & 1)) << num_direct_bits;

        if (pos_slot < 14) {
            result += try rangecoder.parseReverseBitTree(
                num_direct_bits,
                &self.pos_decoders,
                result - pos_slot,
                update,
            );
        } else {
            // High bits come straight from the range coder; the low 4
            // bits use the align tree.
            result += @as(usize, try rangecoder.get(num_direct_bits - 4)) << 4;
            result += try self.align_decoder.parseReverse(rangecoder, update);
        }

        return result;
    }
};
/// Result of decoding one LZMA symbol: keep decoding, or the
/// end-of-stream marker was seen.
const ProcessingStatus = enum {
    continue_,
    finished,
};
/// Accumulates LZMA output. `buf` is the live dictionary the decoder
/// back-references into; `reset` moves it to `to_read`, the queue of
/// bytes handed out to the consumer via `read`.
pub const LzAccumBuffer = struct {
    to_read: ArrayListUnmanaged(u8) = .{},
    buf: ArrayListUnmanaged(u8) = .{},

    pub fn deinit(self: *LzAccumBuffer, allocator: Allocator) void {
        self.to_read.deinit(allocator);
        self.buf.deinit(allocator);
    }

    /// Copies up to output.len queued bytes into `output`, shifts the
    /// remainder to the front of the queue, and returns the count.
    pub fn read(self: *LzAccumBuffer, output: []u8) usize {
        const input = self.to_read.items;
        const n = std.math.min(input.len, output.len);
        std.mem.copy(u8, output[0..n], input[0..n]);
        // Forward copy with dst before src within the same slice is safe.
        std.mem.copy(u8, input, input[n..]);
        self.to_read.shrinkRetainingCapacity(input.len - n);
        return n;
    }

    /// Reserves room in the dictionary for `additional_count` bytes.
    pub fn ensureUnusedCapacity(
        self: *LzAccumBuffer,
        allocator: Allocator,
        additional_count: usize,
    ) !void {
        try self.buf.ensureUnusedCapacity(allocator, additional_count);
    }

    pub fn appendAssumeCapacity(self: *LzAccumBuffer, byte: u8) void {
        self.buf.appendAssumeCapacity(byte);
    }

    /// Flushes the dictionary into the consumer-visible queue
    /// (dictionary reset in LZMA2 terms).
    pub fn reset(self: *LzAccumBuffer, allocator: Allocator) !void {
        try self.to_read.appendSlice(allocator, self.buf.items);
        self.buf.clearRetainingCapacity();
    }

    /// Number of bytes currently in the dictionary (not the queue).
    pub fn len(self: *const LzAccumBuffer) usize {
        return self.buf.items.len;
    }

    /// Last dictionary byte, or `lit` if the dictionary is empty.
    pub fn lastOr(self: *const LzAccumBuffer, lit: u8) u8 {
        const buf_len = self.buf.items.len;
        return if (buf_len == 0)
            lit
        else
            self.buf.items[buf_len - 1];
    }

    /// Byte `dist` positions back from the end of the dictionary;
    /// errors if the distance exceeds what has been produced.
    pub fn lastN(self: *const LzAccumBuffer, dist: usize) !u8 {
        const buf_len = self.buf.items.len;
        if (dist > buf_len) {
            return error.CorruptInput;
        }
        return self.buf.items[buf_len - dist];
    }

    pub fn appendLiteral(self: *LzAccumBuffer, allocator: Allocator, lit: u8) !void {
        try self.buf.append(allocator, lit);
    }

    /// Copies `length` bytes starting `dist` back, byte by byte so the
    /// copy may overlap its own freshly-written output (LZ77 semantics).
    pub fn appendLz(self: *LzAccumBuffer, allocator: Allocator, length: usize, dist: usize) !void {
        const buf_len = self.buf.items.len;
        if (dist > buf_len) {
            return error.CorruptInput;
        }

        var offset = buf_len - dist;
        var i: usize = 0;
        while (i < length) : (i += 1) {
            const x = self.buf.items[offset];
            try self.buf.append(allocator, x);
            offset += 1;
        }
    }
};
/// Binary range (arithmetic) decoder over an in-memory buffer.
/// `range` is the width of the current coding interval and `code` the
/// position inside it; `normalize` keeps `range` above 2^24 by pulling
/// in one byte at a time.
pub const RangeDecoder = struct {
    stream: std.io.FixedBufferStream([]const u8),
    range: u32,
    code: u32,

    /// Skips the leading pad byte and loads the initial 32-bit code.
    pub fn init(buffer: []const u8) !RangeDecoder {
        var dec = RangeDecoder{
            .stream = std.io.fixedBufferStream(buffer),
            .range = 0xFFFF_FFFF,
            .code = 0,
        };
        const reader = dec.stream.reader();
        _ = try reader.readByte();
        dec.code = try reader.readIntBig(u32);
        return dec;
    }

    /// Builds a decoder with explicit range/code (resuming mid-stream).
    pub fn fromParts(
        buffer: []const u8,
        range: u32,
        code: u32,
    ) RangeDecoder {
        return .{
            .stream = std.io.fixedBufferStream(buffer),
            .range = range,
            .code = code,
        };
    }

    pub fn set(self: *RangeDecoder, range: u32, code: u32) void {
        self.range = range;
        self.code = code;
    }

    /// Raw read of remaining buffered bytes into `dest`.
    pub fn readInto(self: *RangeDecoder, dest: []u8) !usize {
        return self.stream.read(dest);
    }

    /// True when the coder has consumed all input and the code is zero
    /// (the canonical clean-termination condition).
    pub inline fn isFinished(self: *const RangeDecoder) bool {
        return self.code == 0 and self.isEof();
    }

    pub inline fn isEof(self: *const RangeDecoder) bool {
        return self.stream.pos == self.stream.buffer.len;
    }

    /// Refills `range`/`code` with one byte when range drops below 2^24.
    inline fn normalize(self: *RangeDecoder) !void {
        if (self.range < 0x0100_0000) {
            self.range <<= 8;
            self.code = (self.code << 8) ^ @as(u32, try self.stream.reader().readByte());
        }
    }

    /// Decodes one bit with fixed 0.5 probability (a "direct bit").
    inline fn getBit(self: *RangeDecoder) !bool {
        self.range >>= 1;

        const bit = self.code >= self.range;
        if (bit)
            self.code -= self.range;

        try self.normalize();
        return bit;
    }

    /// Decodes `count` direct bits, MSB first.
    fn get(self: *RangeDecoder, count: usize) !u32 {
        var result: u32 = 0;
        var i: usize = 0;
        while (i < count) : (i += 1)
            result = (result << 1) ^ @boolToInt(try self.getBit());
        return result;
    }

    /// Decodes one bit using the adaptive probability `prob`
    /// (11-bit, 0x400 = 0.5). When `update` is set, the model is nudged
    /// toward the observed bit by 1/32 of the remaining distance.
    pub inline fn decodeBit(self: *RangeDecoder, prob: *u16, update: bool) !bool {
        const bound = (self.range >> 11) * prob.*;

        if (self.code < bound) {
            if (update)
                prob.* += (0x800 - prob.*) >> 5;
            self.range = bound;

            try self.normalize();
            return false;
        } else {
            if (update)
                prob.* -= prob.* >> 5;
            self.code -= bound;
            self.range -= bound;

            try self.normalize();
            return true;
        }
    }

    /// Walks a probability tree of 2^num_bits leaves, MSB first, and
    /// returns the decoded symbol.
    fn parseBitTree(
        self: *RangeDecoder,
        num_bits: u5,
        probs: []u16,
        update: bool,
    ) !u32 {
        var tmp: u32 = 1;
        var i: u5 = 0;
        while (i < num_bits) : (i += 1) {
            const bit = try self.decodeBit(&probs[tmp], update);
            tmp = (tmp << 1) ^ @boolToInt(bit);
        }
        return tmp - (@as(u32, 1) << num_bits);
    }

    /// Like parseBitTree but emits bits in reverse (LSB-first) order,
    /// indexing the probability array at `offset + tree_position`.
    pub fn parseReverseBitTree(
        self: *RangeDecoder,
        num_bits: u5,
        probs: []u16,
        offset: usize,
        update: bool,
    ) !u32 {
        var result: u32 = 0;
        var tmp: usize = 1;
        var i: u5 = 0;
        while (i < num_bits) : (i += 1) {
            const bit = @boolToInt(try self.decodeBit(&probs[offset + tmp], update));
            tmp = (tmp << 1) ^ bit;
            result ^= @as(u32, bit) << i;
        }
        return result;
    }
};
/// A dense, row-major 2-D array of `T` with a fixed column count.
fn Vec2D(comptime T: type) type {
    return struct {
        data: []T,
        cols: usize,

        const Self = @This();

        /// Allocates a rows*cols grid and sets every cell to `data`.
        /// Fails with `error.Overflow` if rows*cols overflows usize.
        pub fn init(allocator: Allocator, data: T, rows: usize, cols: usize) !Self {
            const total = try std.math.mul(usize, rows, cols);
            const storage = try allocator.alloc(T, total);
            std.mem.set(T, storage, data);
            return Self{ .data = storage, .cols = cols };
        }

        /// Releases the backing storage.
        pub fn deinit(self: *Self, allocator: Allocator) void {
            allocator.free(self.data);
        }

        /// Overwrites every cell with `value`.
        pub fn fill(self: *Self, value: T) void {
            std.mem.set(T, self.data, value);
        }

        /// Returns the slice holding row `row`.
        pub fn get(self: *Self, row: usize) ![]T {
            const begin = try std.math.mul(usize, row, self.cols);
            return self.data[begin .. begin + self.cols];
        }
    };
}
/// Probability tree for range-coded symbols of `num_bits` bits; holds
/// 2^num_bits adaptive probabilities, all initialized to 0x400 (0.5).
const BitTree = struct {
    num_bits: u5,
    probs: ArrayListUnmanaged(u16),

    pub fn init(allocator: Allocator, num_bits: u5) !BitTree {
        var probs_len = @as(usize, 1) << num_bits;
        var probs = try ArrayListUnmanaged(u16).initCapacity(allocator, probs_len);
        while (probs_len > 0) : (probs_len -= 1)
            probs.appendAssumeCapacity(0x400);
        return .{ .num_bits = num_bits, .probs = probs };
    }

    pub fn deinit(self: *BitTree, allocator: Allocator) void {
        self.probs.deinit(allocator);
    }

    /// Decodes one symbol, MSB-first.
    pub fn parse(
        self: *BitTree,
        rangecoder: *RangeDecoder,
        update: bool,
    ) !u32 {
        return rangecoder.parseBitTree(self.num_bits, self.probs.items, update);
    }

    /// Decodes one symbol in reverse (LSB-first) bit order.
    pub fn parseReverse(
        self: *BitTree,
        rangecoder: *RangeDecoder,
        update: bool,
    ) !u32 {
        return rangecoder.parseReverseBitTree(self.num_bits, self.probs.items, 0, update);
    }

    /// Restores all probabilities to the neutral 0x400 value.
    pub fn reset(self: *BitTree) void {
        std.mem.set(u16, self.probs.items, 0x400);
    }
};
/// Decoder for LZMA match lengths: `choice`/`choice2` select between the
/// low (0..7), mid (8..15), and high (16..271) ranges, each backed by a
/// bit tree (one low/mid tree per position state).
const LenDecoder = struct {
    choice: u16,
    choice2: u16,
    low_coder: [16]BitTree,
    mid_coder: [16]BitTree,
    high_coder: BitTree,

    /// Initializes every probability model to 0x400 (0.5). The 16 low
    /// and 16 mid trees are built in loops rather than 32 hand-written
    /// initializers. As before, a mid-initialization failure leaks the
    /// trees built so far (callers treat init failure as fatal).
    pub fn init(allocator: Allocator) !LenDecoder {
        var low_coder: [16]BitTree = undefined;
        for (low_coder) |*t| t.* = try BitTree.init(allocator, 3);

        var mid_coder: [16]BitTree = undefined;
        for (mid_coder) |*t| t.* = try BitTree.init(allocator, 3);

        return .{
            .choice = 0x400,
            .choice2 = 0x400,
            .low_coder = low_coder,
            .mid_coder = mid_coder,
            .high_coder = try BitTree.init(allocator, 8),
        };
    }

    pub fn deinit(self: *LenDecoder, allocator: Allocator) void {
        for (self.low_coder) |*t| t.deinit(allocator);
        for (self.mid_coder) |*t| t.deinit(allocator);
        self.high_coder.deinit(allocator);
    }

    /// Decodes one match length (before the +2 minimum-match bias is
    /// applied by the caller).
    pub fn decode(
        self: *LenDecoder,
        rangecoder: *RangeDecoder,
        pos_state: usize,
        update: bool,
    ) !usize {
        if (!try rangecoder.decodeBit(&self.choice, update)) {
            return @as(usize, try self.low_coder[pos_state].parse(rangecoder, update));
        } else if (!try rangecoder.decodeBit(&self.choice2, update)) {
            return @as(usize, try self.mid_coder[pos_state].parse(rangecoder, update)) + 8;
        } else {
            return @as(usize, try self.high_coder.parse(rangecoder, update)) + 16;
        }
    }

    /// Restores every probability model to the neutral 0x400 value.
    pub fn reset(self: *LenDecoder) void {
        self.choice = 0x400;
        self.choice2 = 0x400;
        for (self.low_coder) |*t| t.reset();
        for (self.mid_coder) |*t| t.reset();
        self.high_coder.reset();
    }
};

View file

@ -0,0 +1,80 @@
const std = @import("../../std.zig");
const testing = std.testing;
const xz = std.compress.xz;
/// Fully decompresses `data` with the xz decoder. Caller owns the
/// returned slice (allocated with `testing.allocator`).
fn decompress(data: []const u8) ![]u8 {
    var in_stream = std.io.fixedBufferStream(data);

    var xz_stream = try xz.decompress(testing.allocator, in_stream.reader());
    defer xz_stream.deinit();

    return xz_stream.reader().readAllAlloc(testing.allocator, std.math.maxInt(usize));
}
/// Asserts that decompressing `data` yields exactly `expected`.
fn testReader(data: []const u8, comptime expected: []const u8) !void {
    const buf = try decompress(data);
    defer testing.allocator.free(buf);

    try testing.expectEqualSlices(u8, expected, buf);
}
test "compressed data" {
try testReader(@embedFile("testdata/good-0-empty.xz"), "");
inline for ([_][]const u8{
"good-1-check-none.xz",
"good-1-check-crc32.xz",
"good-1-check-crc64.xz",
"good-1-check-sha256.xz",
"good-2-lzma2.xz",
"good-1-block_header-1.xz",
"good-1-block_header-2.xz",
"good-1-block_header-3.xz",
}) |filename| {
try testReader(@embedFile("testdata/" ++ filename),
\\Hello
\\World!
\\
);
}
inline for ([_][]const u8{
"good-1-lzma2-1.xz",
"good-1-lzma2-2.xz",
"good-1-lzma2-3.xz",
"good-1-lzma2-4.xz",
}) |filename| {
try testReader(@embedFile("testdata/" ++ filename),
\\Lorem ipsum dolor sit amet, consectetur adipisicing
\\elit, sed do eiusmod tempor incididunt ut
\\labore et dolore magna aliqua. Ut enim
\\ad minim veniam, quis nostrud exercitation ullamco
\\laboris nisi ut aliquip ex ea commodo
\\consequat. Duis aute irure dolor in reprehenderit
\\in voluptate velit esse cillum dolore eu
\\fugiat nulla pariatur. Excepteur sint occaecat cupidatat
\\non proident, sunt in culpa qui officia
\\deserunt mollit anim id est laborum.
\\
);
}
try testReader(@embedFile("testdata/good-1-lzma2-5.xz"), "");
}
test "unsupported" {
inline for ([_][]const u8{
"good-1-delta-lzma2.tiff.xz",
"good-1-x86-lzma2.xz",
"good-1-sparc-lzma2.xz",
"good-1-arm64-lzma2-1.xz",
"good-1-arm64-lzma2-2.xz",
"good-1-3delta-lzma2.xz",
"good-1-empty-bcj-lzma2.xz",
}) |filename| {
try testing.expectError(
error.Unsupported,
decompress(@embedFile("testdata/" ++ filename)),
);
}
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -370,14 +370,11 @@ fn fetchAndUnpack(
if (mem.endsWith(u8, uri.path, ".tar.gz")) {
// I observed the gzip stream to read 1 byte at a time, so I am using a
// buffered reader on the front of it.
var br = std.io.bufferedReaderSize(std.crypto.tls.max_ciphertext_record_len, req.reader());
var gzip_stream = try std.compress.gzip.gzipStream(gpa, br.reader());
defer gzip_stream.deinit();
try std.tar.pipeToFileSystem(tmp_directory.handle, gzip_stream.reader(), .{
.strip_components = 1,
});
try unpackTarball(gpa, &req, tmp_directory.handle, std.compress.gzip);
} else if (mem.endsWith(u8, uri.path, ".tar.xz")) {
// I have not checked what buffer sizes the xz decompression implementation uses
// by default, so the same logic applies for buffering the reader as for gzip.
try unpackTarball(gpa, &req, tmp_directory.handle, std.compress.xz);
} else {
return reportError(
ini,
@ -430,6 +427,22 @@ fn fetchAndUnpack(
return createWithDir(gpa, fqn, global_cache_directory, pkg_dir_sub_path, build_zig_basename);
}
/// Streams the HTTP response body through `compression` — a namespace
/// providing `decompress(allocator, reader)` such as std.compress.gzip
/// or std.compress.xz — and unpacks the resulting tarball into `out_dir`,
/// dropping the archive's top-level directory.
fn unpackTarball(
    gpa: Allocator,
    req: *std.http.Client.Request,
    out_dir: fs.Dir,
    comptime compression: type,
) !void {
    // Buffer reads at the TLS record size; the decompressors otherwise
    // issue many small reads against the network stream.
    var br = std.io.bufferedReaderSize(std.crypto.tls.max_ciphertext_record_len, req.reader());
    var decompress = try compression.decompress(gpa, br.reader());
    defer decompress.deinit();
    try std.tar.pipeToFileSystem(out_dir, decompress.reader(), .{
        .strip_components = 1,
    });
}
fn reportError(
ini: std.Ini,
comp_directory: Compilation.Directory,