zig/lib/compiler/resinator/windows1252.zig
Ryan Liptak 52de2802c4 Lazily compile the zig rc subcommand and use it during zig build-exe
This moves .rc/.manifest compilation out of the main Zig binary, contributing towards #19063

Also:
- Make resinator use Aro as its preprocessor instead of clang
- Sync resinator with upstream
2024-03-11 05:06:16 -07:00

588 lines
31 KiB
Zig

const std = @import("std");
/// Reads Windows-1252 bytes from `reader` until the end of the stream and writes
/// each one to `writer` re-encoded as UTF-8.
/// Returns the total number of UTF-8 bytes written. Any reader error other than
/// `error.EndOfStream`, and any writer error, is returned to the caller.
pub fn windows1252ToUtf8Stream(writer: anytype, reader: anytype) !usize {
    var total: usize = 0;
    while (reader.readByte()) |byte| {
        const cp = toCodepoint(byte);
        if (cp > 0x7F) {
            // The largest value `toCodepoint` returns is below 0x10000, so the
            // UTF-8 encoding never needs more than 3 bytes.
            var encoded: [3]u8 = undefined;
            const len = std.unicode.utf8Encode(cp, &encoded) catch unreachable;
            try writer.writeAll(encoded[0..len]);
            total += len;
        } else {
            // ASCII bytes are identical in Windows-1252 and UTF-8.
            try writer.writeByte(byte);
            total += 1;
        }
    } else |err| switch (err) {
        error.EndOfStream => return total,
        else => |e| return e,
    }
}
/// Converts a Windows-1252 encoded string into a newly allocated, 0-terminated
/// slice of UTF-16 code units (one `u16` per input byte).
/// Caller owns the returned slice and must free it with `allocator`.
/// The only possible error is an allocation failure.
pub fn windows1252ToUtf16AllocZ(allocator: std.mem.Allocator, win1252_str: []const u8) ![:0]u16 {
    // Guaranteed to need exactly the same number of code units as Windows-1252 bytes,
    // since every value returned by `toCodepoint` fits in a single u16.
    const utf16_slice = try allocator.allocSentinel(u16, win1252_str.len, 0);
    // Nothing below can fail, so no `errdefer` is needed to release the allocation.
    for (utf16_slice, win1252_str) |*code_unit, c| {
        code_unit.* = toCodepoint(c);
    }
    return utf16_slice;
}
/// Converts a single Windows-1252 byte to its Unicode codepoint.
/// Only bytes in 0x80...0x9F can map to a different value; every other byte, and
/// the bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D, are returned with their value unchanged.
/// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt
pub fn toCodepoint(c: u8) u16 {
    // Codepoints for the bytes 0x80...0x9F, indexed by `c - 0x80`.
    const high_table = [32]u16{
        0x20ac, // 0x80 Euro Sign
        0x0081, // 0x81 (no dedicated mapping)
        0x201a, // 0x82 Single Low-9 Quotation Mark
        0x0192, // 0x83 Latin Small Letter F With Hook
        0x201e, // 0x84 Double Low-9 Quotation Mark
        0x2026, // 0x85 Horizontal Ellipsis
        0x2020, // 0x86 Dagger
        0x2021, // 0x87 Double Dagger
        0x02c6, // 0x88 Modifier Letter Circumflex Accent
        0x2030, // 0x89 Per Mille Sign
        0x0160, // 0x8a Latin Capital Letter S With Caron
        0x2039, // 0x8b Single Left-Pointing Angle Quotation Mark
        0x0152, // 0x8c Latin Capital Ligature Oe
        0x008d, // 0x8d (no dedicated mapping)
        0x017d, // 0x8e Latin Capital Letter Z With Caron
        0x008f, // 0x8f (no dedicated mapping)
        0x0090, // 0x90 (no dedicated mapping)
        0x2018, // 0x91 Left Single Quotation Mark
        0x2019, // 0x92 Right Single Quotation Mark
        0x201c, // 0x93 Left Double Quotation Mark
        0x201d, // 0x94 Right Double Quotation Mark
        0x2022, // 0x95 Bullet
        0x2013, // 0x96 En Dash
        0x2014, // 0x97 Em Dash
        0x02dc, // 0x98 Small Tilde
        0x2122, // 0x99 Trade Mark Sign
        0x0161, // 0x9a Latin Small Letter S With Caron
        0x203a, // 0x9b Single Right-Pointing Angle Quotation Mark
        0x0153, // 0x9c Latin Small Ligature Oe
        0x009d, // 0x9d (no dedicated mapping)
        0x017e, // 0x9e Latin Small Letter Z With Caron
        0x0178, // 0x9f Latin Capital Letter Y With Diaeresis
    };
    return switch (c) {
        0x80...0x9f => high_table[c - 0x80],
        else => c,
    };
}
/// Maps a Unicode codepoint to a single Windows-1252 byte, using a "best fit"
/// approximation for codepoints that have an entry in the table below.
/// Returns `null` for any codepoint without an entry.
/// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt
/// Plus some mappings found empirically by iterating all codepoints:
/// 0x2007 => 0xA0, // Figure Space
/// 0x2008 => ' ', // Punctuation Space
/// 0x2009 => ' ', // Thin Space
/// 0x200A => ' ', // Hair Space
/// 0x2012 => '-', // Figure Dash
/// 0x2015 => '-', // Horizontal Bar
/// 0x201B => '\'', // Single High-reversed-9 Quotation Mark
/// 0x201F => '"', // Double High-reversed-9 Quotation Mark
/// 0x202F => 0xA0, // Narrow No-Break Space
/// 0x2033 => '"', // Double Prime
/// 0x2036 => '"', // Reversed Double Prime
pub fn bestFitFromCodepoint(codepoint: u21) ?u8 {
return switch (codepoint) {
// Codepoints that map to the byte with the same numeric value. The single values
// 0x81, 0x8D, 0x8F, 0x90 and 0x9D are the bytes in 0x80...0x9F that `toCodepoint`
// returns unchanged; the other bytes in that range are produced by the explicit
// entries further down (e.g. 0x20ac => 0x80).
0x00...0x7F,
0x81,
0x8D,
0x8F,
0x90,
0x9D,
0xA0...0xFF,
=> @intCast(codepoint),
0x0100 => 0x41, // Latin Capital Letter A With Macron
0x0101 => 0x61, // Latin Small Letter A With Macron
0x0102 => 0x41, // Latin Capital Letter A With Breve
0x0103 => 0x61, // Latin Small Letter A With Breve
0x0104 => 0x41, // Latin Capital Letter A With Ogonek
0x0105 => 0x61, // Latin Small Letter A With Ogonek
0x0106 => 0x43, // Latin Capital Letter C With Acute
0x0107 => 0x63, // Latin Small Letter C With Acute
0x0108 => 0x43, // Latin Capital Letter C With Circumflex
0x0109 => 0x63, // Latin Small Letter C With Circumflex
0x010a => 0x43, // Latin Capital Letter C With Dot Above
0x010b => 0x63, // Latin Small Letter C With Dot Above
0x010c => 0x43, // Latin Capital Letter C With Caron
0x010d => 0x63, // Latin Small Letter C With Caron
0x010e => 0x44, // Latin Capital Letter D With Caron
0x010f => 0x64, // Latin Small Letter D With Caron
0x0110 => 0xd0, // Latin Capital Letter D With Stroke
0x0111 => 0x64, // Latin Small Letter D With Stroke
0x0112 => 0x45, // Latin Capital Letter E With Macron
0x0113 => 0x65, // Latin Small Letter E With Macron
0x0114 => 0x45, // Latin Capital Letter E With Breve
0x0115 => 0x65, // Latin Small Letter E With Breve
0x0116 => 0x45, // Latin Capital Letter E With Dot Above
0x0117 => 0x65, // Latin Small Letter E With Dot Above
0x0118 => 0x45, // Latin Capital Letter E With Ogonek
0x0119 => 0x65, // Latin Small Letter E With Ogonek
0x011a => 0x45, // Latin Capital Letter E With Caron
0x011b => 0x65, // Latin Small Letter E With Caron
0x011c => 0x47, // Latin Capital Letter G With Circumflex
0x011d => 0x67, // Latin Small Letter G With Circumflex
0x011e => 0x47, // Latin Capital Letter G With Breve
0x011f => 0x67, // Latin Small Letter G With Breve
0x0120 => 0x47, // Latin Capital Letter G With Dot Above
0x0121 => 0x67, // Latin Small Letter G With Dot Above
0x0122 => 0x47, // Latin Capital Letter G With Cedilla
0x0123 => 0x67, // Latin Small Letter G With Cedilla
0x0124 => 0x48, // Latin Capital Letter H With Circumflex
0x0125 => 0x68, // Latin Small Letter H With Circumflex
0x0126 => 0x48, // Latin Capital Letter H With Stroke
0x0127 => 0x68, // Latin Small Letter H With Stroke
0x0128 => 0x49, // Latin Capital Letter I With Tilde
0x0129 => 0x69, // Latin Small Letter I With Tilde
0x012a => 0x49, // Latin Capital Letter I With Macron
0x012b => 0x69, // Latin Small Letter I With Macron
0x012c => 0x49, // Latin Capital Letter I With Breve
0x012d => 0x69, // Latin Small Letter I With Breve
0x012e => 0x49, // Latin Capital Letter I With Ogonek
0x012f => 0x69, // Latin Small Letter I With Ogonek
0x0130 => 0x49, // Latin Capital Letter I With Dot Above
0x0131 => 0x69, // Latin Small Letter Dotless I
0x0134 => 0x4a, // Latin Capital Letter J With Circumflex
0x0135 => 0x6a, // Latin Small Letter J With Circumflex
0x0136 => 0x4b, // Latin Capital Letter K With Cedilla
0x0137 => 0x6b, // Latin Small Letter K With Cedilla
0x0139 => 0x4c, // Latin Capital Letter L With Acute
0x013a => 0x6c, // Latin Small Letter L With Acute
0x013b => 0x4c, // Latin Capital Letter L With Cedilla
0x013c => 0x6c, // Latin Small Letter L With Cedilla
0x013d => 0x4c, // Latin Capital Letter L With Caron
0x013e => 0x6c, // Latin Small Letter L With Caron
0x0141 => 0x4c, // Latin Capital Letter L With Stroke
0x0142 => 0x6c, // Latin Small Letter L With Stroke
0x0143 => 0x4e, // Latin Capital Letter N With Acute
0x0144 => 0x6e, // Latin Small Letter N With Acute
0x0145 => 0x4e, // Latin Capital Letter N With Cedilla
0x0146 => 0x6e, // Latin Small Letter N With Cedilla
0x0147 => 0x4e, // Latin Capital Letter N With Caron
0x0148 => 0x6e, // Latin Small Letter N With Caron
0x014c => 0x4f, // Latin Capital Letter O With Macron
0x014d => 0x6f, // Latin Small Letter O With Macron
0x014e => 0x4f, // Latin Capital Letter O With Breve
0x014f => 0x6f, // Latin Small Letter O With Breve
0x0150 => 0x4f, // Latin Capital Letter O With Double Acute
0x0151 => 0x6f, // Latin Small Letter O With Double Acute
0x0152 => 0x8c, // Latin Capital Ligature Oe
0x0153 => 0x9c, // Latin Small Ligature Oe
0x0154 => 0x52, // Latin Capital Letter R With Acute
0x0155 => 0x72, // Latin Small Letter R With Acute
0x0156 => 0x52, // Latin Capital Letter R With Cedilla
0x0157 => 0x72, // Latin Small Letter R With Cedilla
0x0158 => 0x52, // Latin Capital Letter R With Caron
0x0159 => 0x72, // Latin Small Letter R With Caron
0x015a => 0x53, // Latin Capital Letter S With Acute
0x015b => 0x73, // Latin Small Letter S With Acute
0x015c => 0x53, // Latin Capital Letter S With Circumflex
0x015d => 0x73, // Latin Small Letter S With Circumflex
0x015e => 0x53, // Latin Capital Letter S With Cedilla
0x015f => 0x73, // Latin Small Letter S With Cedilla
0x0160 => 0x8a, // Latin Capital Letter S With Caron
0x0161 => 0x9a, // Latin Small Letter S With Caron
0x0162 => 0x54, // Latin Capital Letter T With Cedilla
0x0163 => 0x74, // Latin Small Letter T With Cedilla
0x0164 => 0x54, // Latin Capital Letter T With Caron
0x0165 => 0x74, // Latin Small Letter T With Caron
0x0166 => 0x54, // Latin Capital Letter T With Stroke
0x0167 => 0x74, // Latin Small Letter T With Stroke
0x0168 => 0x55, // Latin Capital Letter U With Tilde
0x0169 => 0x75, // Latin Small Letter U With Tilde
0x016a => 0x55, // Latin Capital Letter U With Macron
0x016b => 0x75, // Latin Small Letter U With Macron
0x016c => 0x55, // Latin Capital Letter U With Breve
0x016d => 0x75, // Latin Small Letter U With Breve
0x016e => 0x55, // Latin Capital Letter U With Ring Above
0x016f => 0x75, // Latin Small Letter U With Ring Above
0x0170 => 0x55, // Latin Capital Letter U With Double Acute
0x0171 => 0x75, // Latin Small Letter U With Double Acute
0x0172 => 0x55, // Latin Capital Letter U With Ogonek
0x0173 => 0x75, // Latin Small Letter U With Ogonek
0x0174 => 0x57, // Latin Capital Letter W With Circumflex
0x0175 => 0x77, // Latin Small Letter W With Circumflex
0x0176 => 0x59, // Latin Capital Letter Y With Circumflex
0x0177 => 0x79, // Latin Small Letter Y With Circumflex
0x0178 => 0x9f, // Latin Capital Letter Y With Diaeresis
0x0179 => 0x5a, // Latin Capital Letter Z With Acute
0x017a => 0x7a, // Latin Small Letter Z With Acute
0x017b => 0x5a, // Latin Capital Letter Z With Dot Above
0x017c => 0x7a, // Latin Small Letter Z With Dot Above
0x017d => 0x8e, // Latin Capital Letter Z With Caron
0x017e => 0x9e, // Latin Small Letter Z With Caron
0x0180 => 0x62, // Latin Small Letter B With Stroke
0x0189 => 0xd0, // Latin Capital Letter African D
0x0191 => 0x83, // Latin Capital Letter F With Hook
0x0192 => 0x83, // Latin Small Letter F With Hook
0x0197 => 0x49, // Latin Capital Letter I With Stroke
0x019a => 0x6c, // Latin Small Letter L With Bar
0x019f => 0x4f, // Latin Capital Letter O With Middle Tilde
0x01a0 => 0x4f, // Latin Capital Letter O With Horn
0x01a1 => 0x6f, // Latin Small Letter O With Horn
0x01ab => 0x74, // Latin Small Letter T With Palatal Hook
0x01ae => 0x54, // Latin Capital Letter T With Retroflex Hook
0x01af => 0x55, // Latin Capital Letter U With Horn
0x01b0 => 0x75, // Latin Small Letter U With Horn
0x01b6 => 0x7a, // Latin Small Letter Z With Stroke
0x01c0 => 0x7c, // Latin Letter Dental Click
0x01c3 => 0x21, // Latin Letter Retroflex Click
0x01cd => 0x41, // Latin Capital Letter A With Caron
0x01ce => 0x61, // Latin Small Letter A With Caron
0x01cf => 0x49, // Latin Capital Letter I With Caron
0x01d0 => 0x69, // Latin Small Letter I With Caron
0x01d1 => 0x4f, // Latin Capital Letter O With Caron
0x01d2 => 0x6f, // Latin Small Letter O With Caron
0x01d3 => 0x55, // Latin Capital Letter U With Caron
0x01d4 => 0x75, // Latin Small Letter U With Caron
0x01d5 => 0x55, // Latin Capital Letter U With Diaeresis And Macron
0x01d6 => 0x75, // Latin Small Letter U With Diaeresis And Macron
0x01d7 => 0x55, // Latin Capital Letter U With Diaeresis And Acute
0x01d8 => 0x75, // Latin Small Letter U With Diaeresis And Acute
0x01d9 => 0x55, // Latin Capital Letter U With Diaeresis And Caron
0x01da => 0x75, // Latin Small Letter U With Diaeresis And Caron
0x01db => 0x55, // Latin Capital Letter U With Diaeresis And Grave
0x01dc => 0x75, // Latin Small Letter U With Diaeresis And Grave
0x01de => 0x41, // Latin Capital Letter A With Diaeresis And Macron
0x01df => 0x61, // Latin Small Letter A With Diaeresis And Macron
0x01e4 => 0x47, // Latin Capital Letter G With Stroke
0x01e5 => 0x67, // Latin Small Letter G With Stroke
0x01e6 => 0x47, // Latin Capital Letter G With Caron
0x01e7 => 0x67, // Latin Small Letter G With Caron
0x01e8 => 0x4b, // Latin Capital Letter K With Caron
0x01e9 => 0x6b, // Latin Small Letter K With Caron
0x01ea => 0x4f, // Latin Capital Letter O With Ogonek
0x01eb => 0x6f, // Latin Small Letter O With Ogonek
0x01ec => 0x4f, // Latin Capital Letter O With Ogonek And Macron
0x01ed => 0x6f, // Latin Small Letter O With Ogonek And Macron
0x01f0 => 0x6a, // Latin Small Letter J With Caron
0x0261 => 0x67, // Latin Small Letter Script G
0x02b9 => 0x27, // Modifier Letter Prime
0x02ba => 0x22, // Modifier Letter Double Prime
0x02bc => 0x27, // Modifier Letter Apostrophe
0x02c4 => 0x5e, // Modifier Letter Up Arrowhead
0x02c6 => 0x88, // Modifier Letter Circumflex Accent
0x02c8 => 0x27, // Modifier Letter Vertical Line
0x02c9 => 0xaf, // Modifier Letter Macron
0x02ca => 0xb4, // Modifier Letter Acute Accent
0x02cb => 0x60, // Modifier Letter Grave Accent
0x02cd => 0x5f, // Modifier Letter Low Macron
0x02da => 0xb0, // Ring Above
0x02dc => 0x98, // Small Tilde
0x0300 => 0x60, // Combining Grave Accent
0x0301 => 0xb4, // Combining Acute Accent
0x0302 => 0x5e, // Combining Circumflex Accent
0x0303 => 0x7e, // Combining Tilde
0x0304 => 0xaf, // Combining Macron
0x0305 => 0xaf, // Combining Overline
0x0308 => 0xa8, // Combining Diaeresis
0x030a => 0xb0, // Combining Ring Above
0x030e => 0x22, // Combining Double Vertical Line Above
0x0327 => 0xb8, // Combining Cedilla
0x0331 => 0x5f, // Combining Macron Below
0x0332 => 0x5f, // Combining Low Line
0x037e => 0x3b, // Greek Question Mark
0x0393 => 0x47, // Greek Capital Letter Gamma
0x0398 => 0x54, // Greek Capital Letter Theta
0x03a3 => 0x53, // Greek Capital Letter Sigma
0x03a6 => 0x46, // Greek Capital Letter Phi
0x03a9 => 0x4f, // Greek Capital Letter Omega
0x03b1 => 0x61, // Greek Small Letter Alpha
0x03b2 => 0xdf, // Greek Small Letter Beta
0x03b4 => 0x64, // Greek Small Letter Delta
0x03b5 => 0x65, // Greek Small Letter Epsilon
0x03bc => 0xb5, // Greek Small Letter Mu
0x03c0 => 0x70, // Greek Small Letter Pi
0x03c3 => 0x73, // Greek Small Letter Sigma
0x03c4 => 0x74, // Greek Small Letter Tau
0x03c6 => 0x66, // Greek Small Letter Phi
0x04bb => 0x68, // Cyrillic Small Letter Shha
0x0589 => 0x3a, // Armenian Full Stop
0x066a => 0x25, // Arabic Percent Sign
0x2000 => 0x20, // En Quad
0x2001 => 0x20, // Em Quad
0x2002 => 0x20, // En Space
0x2003 => 0x20, // Em Space
0x2004 => 0x20, // Three-Per-Em Space
0x2005 => 0x20, // Four-Per-Em Space
0x2006 => 0x20, // Six-Per-Em Space
0x2010 => 0x2d, // Hyphen
0x2011 => 0x2d, // Non-Breaking Hyphen
0x2013 => 0x96, // En Dash
0x2014 => 0x97, // Em Dash
0x2017 => 0x3d, // Double Low Line
0x2018 => 0x91, // Left Single Quotation Mark
0x2019 => 0x92, // Right Single Quotation Mark
0x201a => 0x82, // Single Low-9 Quotation Mark
0x201c => 0x93, // Left Double Quotation Mark
0x201d => 0x94, // Right Double Quotation Mark
0x201e => 0x84, // Double Low-9 Quotation Mark
0x2020 => 0x86, // Dagger
0x2021 => 0x87, // Double Dagger
0x2022 => 0x95, // Bullet
0x2024 => 0xb7, // One Dot Leader
0x2026 => 0x85, // Horizontal Ellipsis
0x2030 => 0x89, // Per Mille Sign
0x2032 => 0x27, // Prime
0x2035 => 0x60, // Reversed Prime
0x2039 => 0x8b, // Single Left-Pointing Angle Quotation Mark
0x203a => 0x9b, // Single Right-Pointing Angle Quotation Mark
0x2044 => 0x2f, // Fraction Slash
0x2070 => 0xb0, // Superscript Zero
0x2074 => 0x34, // Superscript Four
0x2075 => 0x35, // Superscript Five
0x2076 => 0x36, // Superscript Six
0x2077 => 0x37, // Superscript Seven
0x2078 => 0x38, // Superscript Eight
0x207f => 0x6e, // Superscript Latin Small Letter N
0x2080 => 0x30, // Subscript Zero
0x2081 => 0x31, // Subscript One
0x2082 => 0x32, // Subscript Two
0x2083 => 0x33, // Subscript Three
0x2084 => 0x34, // Subscript Four
0x2085 => 0x35, // Subscript Five
0x2086 => 0x36, // Subscript Six
0x2087 => 0x37, // Subscript Seven
0x2088 => 0x38, // Subscript Eight
0x2089 => 0x39, // Subscript Nine
0x20ac => 0x80, // Euro Sign
0x20a1 => 0xa2, // Colon Sign
0x20a4 => 0xa3, // Lira Sign
0x20a7 => 0x50, // Peseta Sign
0x2102 => 0x43, // Double-Struck Capital C
0x2107 => 0x45, // Euler Constant
0x210a => 0x67, // Script Small G
0x210b => 0x48, // Script Capital H
0x210c => 0x48, // Black-Letter Capital H
0x210d => 0x48, // Double-Struck Capital H
0x210e => 0x68, // Planck Constant
0x2110 => 0x49, // Script Capital I
0x2111 => 0x49, // Black-Letter Capital I
0x2112 => 0x4c, // Script Capital L
0x2113 => 0x6c, // Script Small L
0x2115 => 0x4e, // Double-Struck Capital N
0x2118 => 0x50, // Script Capital P
0x2119 => 0x50, // Double-Struck Capital P
0x211a => 0x51, // Double-Struck Capital Q
0x211b => 0x52, // Script Capital R
0x211c => 0x52, // Black-Letter Capital R
0x211d => 0x52, // Double-Struck Capital R
0x2122 => 0x99, // Trade Mark Sign
0x2124 => 0x5a, // Double-Struck Capital Z
0x2128 => 0x5a, // Black-Letter Capital Z
0x212a => 0x4b, // Kelvin Sign
0x212b => 0xc5, // Angstrom Sign
0x212c => 0x42, // Script Capital B
0x212d => 0x43, // Black-Letter Capital C
0x212e => 0x65, // Estimated Symbol
0x212f => 0x65, // Script Small E
0x2130 => 0x45, // Script Capital E
0x2131 => 0x46, // Script Capital F
0x2133 => 0x4d, // Script Capital M
0x2134 => 0x6f, // Script Small O
0x2205 => 0xd8, // Empty Set
0x2212 => 0x2d, // Minus Sign
0x2213 => 0xb1, // Minus-Or-Plus Sign
0x2215 => 0x2f, // Division Slash
0x2216 => 0x5c, // Set Minus
0x2217 => 0x2a, // Asterisk Operator
0x2218 => 0xb0, // Ring Operator
0x2219 => 0xb7, // Bullet Operator
0x221a => 0x76, // Square Root
0x221e => 0x38, // Infinity
0x2223 => 0x7c, // Divides
0x2229 => 0x6e, // Intersection
0x2236 => 0x3a, // Ratio
0x223c => 0x7e, // Tilde Operator
0x2248 => 0x98, // Almost Equal To
0x2261 => 0x3d, // Identical To
0x2264 => 0x3d, // Less-Than Or Equal To
0x2265 => 0x3d, // Greater-Than Or Equal To
0x226a => 0xab, // Much Less-Than
0x226b => 0xbb, // Much Greater-Than
0x22c5 => 0xb7, // Dot Operator
0x2302 => 0xa6, // House
0x2303 => 0x5e, // Up Arrowhead
0x2310 => 0xac, // Reversed Not Sign
0x2320 => 0x28, // Top Half Integral
0x2321 => 0x29, // Bottom Half Integral
0x2329 => 0x3c, // Left-Pointing Angle Bracket
0x232a => 0x3e, // Right-Pointing Angle Bracket
0x2500 => 0x2d, // Box Drawings Light Horizontal
0x2502 => 0xa6, // Box Drawings Light Vertical
0x250c => 0x2b, // Box Drawings Light Down And Right
0x2510 => 0x2b, // Box Drawings Light Down And Left
0x2514 => 0x2b, // Box Drawings Light Up And Right
0x2518 => 0x2b, // Box Drawings Light Up And Left
0x251c => 0x2b, // Box Drawings Light Vertical And Right
0x2524 => 0xa6, // Box Drawings Light Vertical And Left
0x252c => 0x2d, // Box Drawings Light Down And Horizontal
0x2534 => 0x2d, // Box Drawings Light Up And Horizontal
0x253c => 0x2b, // Box Drawings Light Vertical And Horizontal
0x2550 => 0x2d, // Box Drawings Double Horizontal
0x2551 => 0xa6, // Box Drawings Double Vertical
0x2552 => 0x2b, // Box Drawings Down Single And Right Double
0x2553 => 0x2b, // Box Drawings Down Double And Right Single
0x2554 => 0x2b, // Box Drawings Double Down And Right
0x2555 => 0x2b, // Box Drawings Down Single And Left Double
0x2556 => 0x2b, // Box Drawings Down Double And Left Single
0x2557 => 0x2b, // Box Drawings Double Down And Left
0x2558 => 0x2b, // Box Drawings Up Single And Right Double
0x2559 => 0x2b, // Box Drawings Up Double And Right Single
0x255a => 0x2b, // Box Drawings Double Up And Right
0x255b => 0x2b, // Box Drawings Up Single And Left Double
0x255c => 0x2b, // Box Drawings Up Double And Left Single
0x255d => 0x2b, // Box Drawings Double Up And Left
0x255e => 0xa6, // Box Drawings Vertical Single And Right Double
0x255f => 0xa6, // Box Drawings Vertical Double And Right Single
0x2560 => 0xa6, // Box Drawings Double Vertical And Right
0x2561 => 0xa6, // Box Drawings Vertical Single And Left Double
0x2562 => 0xa6, // Box Drawings Vertical Double And Left Single
0x2563 => 0xa6, // Box Drawings Double Vertical And Left
0x2564 => 0x2d, // Box Drawings Down Single And Horizontal Double
0x2565 => 0x2d, // Box Drawings Down Double And Horizontal Single
0x2566 => 0x2d, // Box Drawings Double Down And Horizontal
0x2567 => 0x2d, // Box Drawings Up Single And Horizontal Double
0x2568 => 0x2d, // Box Drawings Up Double And Horizontal Single
0x2569 => 0x2d, // Box Drawings Double Up And Horizontal
0x256a => 0x2b, // Box Drawings Vertical Single And Horizontal Double
0x256b => 0x2b, // Box Drawings Vertical Double And Horizontal Single
0x256c => 0x2b, // Box Drawings Double Vertical And Horizontal
0x2580 => 0xaf, // Upper Half Block
0x2584 => 0x5f, // Lower Half Block
0x2588 => 0xa6, // Full Block
0x258c => 0xa6, // Left Half Block
0x2590 => 0xa6, // Right Half Block
0x2591 => 0xa6, // Light Shade
0x2592 => 0xa6, // Medium Shade
0x2593 => 0xa6, // Dark Shade
0x25a0 => 0xa6, // Black Square
0x263c => 0xa4, // White Sun With Rays
0x2758 => 0x7c, // Light Vertical Bar
0x3000 => 0x20, // Ideographic Space
0x3008 => 0x3c, // Left Angle Bracket
0x3009 => 0x3e, // Right Angle Bracket
0x300a => 0xab, // Left Double Angle Bracket
0x300b => 0xbb, // Right Double Angle Bracket
0x301a => 0x5b, // Left White Square Bracket
0x301b => 0x5d, // Right White Square Bracket
0x30fb => 0xb7, // Katakana Middle Dot
0xff01 => 0x21, // Fullwidth Exclamation Mark
0xff02 => 0x22, // Fullwidth Quotation Mark
0xff03 => 0x23, // Fullwidth Number Sign
0xff04 => 0x24, // Fullwidth Dollar Sign
0xff05 => 0x25, // Fullwidth Percent Sign
0xff06 => 0x26, // Fullwidth Ampersand
0xff07 => 0x27, // Fullwidth Apostrophe
0xff08 => 0x28, // Fullwidth Left Parenthesis
0xff09 => 0x29, // Fullwidth Right Parenthesis
0xff0a => 0x2a, // Fullwidth Asterisk
0xff0b => 0x2b, // Fullwidth Plus Sign
0xff0c => 0x2c, // Fullwidth Comma
0xff0d => 0x2d, // Fullwidth Hyphen-Minus
0xff0e => 0x2e, // Fullwidth Full Stop
0xff0f => 0x2f, // Fullwidth Solidus
0xff10 => 0x30, // Fullwidth Digit Zero
0xff11 => 0x31, // Fullwidth Digit One
0xff12 => 0x32, // Fullwidth Digit Two
0xff13 => 0x33, // Fullwidth Digit Three
0xff14 => 0x34, // Fullwidth Digit Four
0xff15 => 0x35, // Fullwidth Digit Five
0xff16 => 0x36, // Fullwidth Digit Six
0xff17 => 0x37, // Fullwidth Digit Seven
0xff18 => 0x38, // Fullwidth Digit Eight
0xff19 => 0x39, // Fullwidth Digit Nine
0xff1a => 0x3a, // Fullwidth Colon
0xff1b => 0x3b, // Fullwidth Semicolon
0xff1c => 0x3c, // Fullwidth Less-Than Sign
0xff1d => 0x3d, // Fullwidth Equals Sign
0xff1e => 0x3e, // Fullwidth Greater-Than Sign
0xff1f => 0x3f, // Fullwidth Question Mark
0xff20 => 0x40, // Fullwidth Commercial At
0xff21 => 0x41, // Fullwidth Latin Capital Letter A
0xff22 => 0x42, // Fullwidth Latin Capital Letter B
0xff23 => 0x43, // Fullwidth Latin Capital Letter C
0xff24 => 0x44, // Fullwidth Latin Capital Letter D
0xff25 => 0x45, // Fullwidth Latin Capital Letter E
0xff26 => 0x46, // Fullwidth Latin Capital Letter F
0xff27 => 0x47, // Fullwidth Latin Capital Letter G
0xff28 => 0x48, // Fullwidth Latin Capital Letter H
0xff29 => 0x49, // Fullwidth Latin Capital Letter I
0xff2a => 0x4a, // Fullwidth Latin Capital Letter J
0xff2b => 0x4b, // Fullwidth Latin Capital Letter K
0xff2c => 0x4c, // Fullwidth Latin Capital Letter L
0xff2d => 0x4d, // Fullwidth Latin Capital Letter M
0xff2e => 0x4e, // Fullwidth Latin Capital Letter N
0xff2f => 0x4f, // Fullwidth Latin Capital Letter O
0xff30 => 0x50, // Fullwidth Latin Capital Letter P
0xff31 => 0x51, // Fullwidth Latin Capital Letter Q
0xff32 => 0x52, // Fullwidth Latin Capital Letter R
0xff33 => 0x53, // Fullwidth Latin Capital Letter S
0xff34 => 0x54, // Fullwidth Latin Capital Letter T
0xff35 => 0x55, // Fullwidth Latin Capital Letter U
0xff36 => 0x56, // Fullwidth Latin Capital Letter V
0xff37 => 0x57, // Fullwidth Latin Capital Letter W
0xff38 => 0x58, // Fullwidth Latin Capital Letter X
0xff39 => 0x59, // Fullwidth Latin Capital Letter Y
0xff3a => 0x5a, // Fullwidth Latin Capital Letter Z
0xff3b => 0x5b, // Fullwidth Left Square Bracket
0xff3c => 0x5c, // Fullwidth Reverse Solidus
0xff3d => 0x5d, // Fullwidth Right Square Bracket
0xff3e => 0x5e, // Fullwidth Circumflex Accent
0xff3f => 0x5f, // Fullwidth Low Line
0xff40 => 0x60, // Fullwidth Grave Accent
0xff41 => 0x61, // Fullwidth Latin Small Letter A
0xff42 => 0x62, // Fullwidth Latin Small Letter B
0xff43 => 0x63, // Fullwidth Latin Small Letter C
0xff44 => 0x64, // Fullwidth Latin Small Letter D
0xff45 => 0x65, // Fullwidth Latin Small Letter E
0xff46 => 0x66, // Fullwidth Latin Small Letter F
0xff47 => 0x67, // Fullwidth Latin Small Letter G
0xff48 => 0x68, // Fullwidth Latin Small Letter H
0xff49 => 0x69, // Fullwidth Latin Small Letter I
0xff4a => 0x6a, // Fullwidth Latin Small Letter J
0xff4b => 0x6b, // Fullwidth Latin Small Letter K
0xff4c => 0x6c, // Fullwidth Latin Small Letter L
0xff4d => 0x6d, // Fullwidth Latin Small Letter M
0xff4e => 0x6e, // Fullwidth Latin Small Letter N
0xff4f => 0x6f, // Fullwidth Latin Small Letter O
0xff50 => 0x70, // Fullwidth Latin Small Letter P
0xff51 => 0x71, // Fullwidth Latin Small Letter Q
0xff52 => 0x72, // Fullwidth Latin Small Letter R
0xff53 => 0x73, // Fullwidth Latin Small Letter S
0xff54 => 0x74, // Fullwidth Latin Small Letter T
0xff55 => 0x75, // Fullwidth Latin Small Letter U
0xff56 => 0x76, // Fullwidth Latin Small Letter V
0xff57 => 0x77, // Fullwidth Latin Small Letter W
0xff58 => 0x78, // Fullwidth Latin Small Letter X
0xff59 => 0x79, // Fullwidth Latin Small Letter Y
0xff5a => 0x7a, // Fullwidth Latin Small Letter Z
0xff5b => 0x7b, // Fullwidth Left Curly Bracket
0xff5c => 0x7c, // Fullwidth Vertical Line
0xff5d => 0x7d, // Fullwidth Right Curly Bracket
0xff5e => 0x7e, // Fullwidth Tilde
// Not in the best fit mapping, but RC uses these mappings too
0x2007 => 0xA0, // Figure Space
0x2008 => ' ', // Punctuation Space
0x2009 => ' ', // Thin Space
0x200A => ' ', // Hair Space
0x2012 => '-', // Figure Dash
0x2015 => '-', // Horizontal Bar
0x201B => '\'', // Single High-reversed-9 Quotation Mark
0x201F => '"', // Double High-reversed-9 Quotation Mark
0x202F => 0xA0, // Narrow No-Break Space
0x2033 => '"', // Double Prime
0x2036 => '"', // Reversed Double Prime
// Every other codepoint has no entry in this table.
else => null,
};
}
// Feeds a sample of Windows-1252 bytes (ASCII, the unmapped byte 0x81, the remapped
// bytes in 0x80...0x9F, and most of 0xA1...0xFF) through `windows1252ToUtf8Stream`
// and checks both the UTF-8 output and the reported byte count.
test "windows-1252 to utf8" {
    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();

    const input_windows1252 = "\x81pqrstuvwxyz{|}~\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8e\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9e\x9f\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";
    // The bytes \xd7\xd8 must come out as U+00D7 (×) followed by U+00D8 (Ø).
    const expected_utf8 = "\xc2\x81pqrstuvwxyz{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";

    var fbs = std.io.fixedBufferStream(input_windows1252);
    const bytes_written = try windows1252ToUtf8Stream(buf.writer(), fbs.reader());

    try std.testing.expectEqualStrings(expected_utf8, buf.items);
    try std.testing.expectEqual(expected_utf8.len, bytes_written);
}