From 4baa448335a8158d424a1e648c907fa7566f7060 Mon Sep 17 00:00:00 2001 From: Ryan Liptak Date: Fri, 17 Jan 2025 23:02:40 -0800 Subject: [PATCH 1/2] resinator: Fix incorrect args being printed in cli diagnostics --- lib/compiler/resinator/main.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compiler/resinator/main.zig b/lib/compiler/resinator/main.zig index a918081226f5..6973c6e6b592 100644 --- a/lib/compiler/resinator/main.zig +++ b/lib/compiler/resinator/main.zig @@ -64,7 +64,7 @@ pub fn main() !void { if (!zig_integration) { // print any warnings/notes - cli_diagnostics.renderToStdErr(args, stderr_config); + cli_diagnostics.renderToStdErr(cli_args, stderr_config); // If there was something printed, then add an extra newline separator // so that there is a clear separation between the cli diagnostics and whatever // gets printed after From 289e9c3507d622bd42a6dee4df8a69dd71a7dfe6 Mon Sep 17 00:00:00 2001 From: Ryan Liptak Date: Fri, 17 Jan 2025 19:07:46 -0800 Subject: [PATCH 2/2] resinator: Sync with upstream Note: This mostly matches resinator v0.1.0 rather than the latest master version, since the latest master version focuses on adding support for .res -> .obj conversion which is not necessary for the future planned relationship of zig and resinator (resinator will likely be moved out of the compiler and into the build system, a la translate-c). So, ultimately the changes here consist mostly of bug fixes for obscure edge cases. 
--- lib/compiler/resinator/ast.zig | 98 +-- lib/compiler/resinator/bmp.zig | 13 +- lib/compiler/resinator/cli.zig | 127 ++-- lib/compiler/resinator/code_pages.zig | 217 +++---- lib/compiler/resinator/comments.zig | 25 + lib/compiler/resinator/compile.zig | 312 ++++----- lib/compiler/resinator/disjoint_code_page.zig | 99 +++ lib/compiler/resinator/errors.zig | 595 ++++++++++++------ lib/compiler/resinator/lang.zig | 2 +- lib/compiler/resinator/lex.zig | 247 ++++---- lib/compiler/resinator/literals.zig | 373 ++++++++--- lib/compiler/resinator/main.zig | 34 +- lib/compiler/resinator/parse.zig | 243 +++++-- lib/compiler/resinator/preprocess.zig | 1 + lib/compiler/resinator/rc.zig | 14 +- lib/compiler/resinator/res.zig | 213 +++++-- lib/compiler/resinator/source_mapping.zig | 479 ++++++++++++-- 17 files changed, 2144 insertions(+), 948 deletions(-) create mode 100644 lib/compiler/resinator/disjoint_code_page.zig diff --git a/lib/compiler/resinator/ast.zig b/lib/compiler/resinator/ast.zig index 31250ea71e2c..20eedb652d19 100644 --- a/lib/compiler/resinator/ast.zig +++ b/lib/compiler/resinator/ast.zig @@ -1,7 +1,7 @@ const std = @import("std"); const Allocator = std.mem.Allocator; const Token = @import("lex.zig").Token; -const CodePage = @import("code_pages.zig").CodePage; +const SupportedCodePage = @import("code_pages.zig").SupportedCodePage; pub const Tree = struct { node: *Node, @@ -28,11 +28,11 @@ pub const Tree = struct { }; pub const CodePageLookup = struct { - lookup: std.ArrayListUnmanaged(CodePage) = .empty, + lookup: std.ArrayListUnmanaged(SupportedCodePage) = .empty, allocator: Allocator, - default_code_page: CodePage, + default_code_page: SupportedCodePage, - pub fn init(allocator: Allocator, default_code_page: CodePage) CodePageLookup { + pub fn init(allocator: Allocator, default_code_page: SupportedCodePage) CodePageLookup { return .{ .allocator = allocator, .default_code_page = default_code_page, @@ -44,7 +44,7 @@ pub const CodePageLookup = struct { } /// 
line_num is 1-indexed - pub fn setForLineNum(self: *CodePageLookup, line_num: usize, code_page: CodePage) !void { + pub fn setForLineNum(self: *CodePageLookup, line_num: usize, code_page: SupportedCodePage) !void { const index = line_num - 1; if (index >= self.lookup.items.len) { const new_size = line_num; @@ -66,16 +66,16 @@ pub const CodePageLookup = struct { self.lookup.items[index] = code_page; } - pub fn setForToken(self: *CodePageLookup, token: Token, code_page: CodePage) !void { + pub fn setForToken(self: *CodePageLookup, token: Token, code_page: SupportedCodePage) !void { return self.setForLineNum(token.line_number, code_page); } /// line_num is 1-indexed - pub fn getForLineNum(self: CodePageLookup, line_num: usize) CodePage { + pub fn getForLineNum(self: CodePageLookup, line_num: usize) SupportedCodePage { return self.lookup.items[line_num - 1]; } - pub fn getForToken(self: CodePageLookup, token: Token) CodePage { + pub fn getForToken(self: CodePageLookup, token: Token) SupportedCodePage { return self.getForLineNum(token.line_number); } }; @@ -85,21 +85,21 @@ test "CodePageLookup" { defer lookup.deinit(); try lookup.setForLineNum(5, .utf8); - try std.testing.expectEqual(CodePage.windows1252, lookup.getForLineNum(1)); - try std.testing.expectEqual(CodePage.windows1252, lookup.getForLineNum(2)); - try std.testing.expectEqual(CodePage.windows1252, lookup.getForLineNum(3)); - try std.testing.expectEqual(CodePage.windows1252, lookup.getForLineNum(4)); - try std.testing.expectEqual(CodePage.utf8, lookup.getForLineNum(5)); + try std.testing.expectEqual(SupportedCodePage.windows1252, lookup.getForLineNum(1)); + try std.testing.expectEqual(SupportedCodePage.windows1252, lookup.getForLineNum(2)); + try std.testing.expectEqual(SupportedCodePage.windows1252, lookup.getForLineNum(3)); + try std.testing.expectEqual(SupportedCodePage.windows1252, lookup.getForLineNum(4)); + try std.testing.expectEqual(SupportedCodePage.utf8, lookup.getForLineNum(5)); try 
std.testing.expectEqual(@as(usize, 5), lookup.lookup.items.len); try lookup.setForLineNum(7, .windows1252); - try std.testing.expectEqual(CodePage.windows1252, lookup.getForLineNum(1)); - try std.testing.expectEqual(CodePage.windows1252, lookup.getForLineNum(2)); - try std.testing.expectEqual(CodePage.windows1252, lookup.getForLineNum(3)); - try std.testing.expectEqual(CodePage.windows1252, lookup.getForLineNum(4)); - try std.testing.expectEqual(CodePage.utf8, lookup.getForLineNum(5)); - try std.testing.expectEqual(CodePage.utf8, lookup.getForLineNum(6)); - try std.testing.expectEqual(CodePage.windows1252, lookup.getForLineNum(7)); + try std.testing.expectEqual(SupportedCodePage.windows1252, lookup.getForLineNum(1)); + try std.testing.expectEqual(SupportedCodePage.windows1252, lookup.getForLineNum(2)); + try std.testing.expectEqual(SupportedCodePage.windows1252, lookup.getForLineNum(3)); + try std.testing.expectEqual(SupportedCodePage.windows1252, lookup.getForLineNum(4)); + try std.testing.expectEqual(SupportedCodePage.utf8, lookup.getForLineNum(5)); + try std.testing.expectEqual(SupportedCodePage.utf8, lookup.getForLineNum(6)); + try std.testing.expectEqual(SupportedCodePage.windows1252, lookup.getForLineNum(7)); try std.testing.expectEqual(@as(usize, 7), lookup.lookup.items.len); } @@ -734,31 +734,31 @@ pub const Node = struct { switch (node.id) { .root => { try writer.writeAll("\n"); - const root: *Node.Root = @alignCast(@fieldParentPtr("base", node)); + const root: *const Node.Root = @alignCast(@fieldParentPtr("base", node)); for (root.body) |body_node| { try body_node.dump(tree, writer, indent + 1); } }, .resource_external => { - const resource: *Node.ResourceExternal = @alignCast(@fieldParentPtr("base", node)); + const resource: *const Node.ResourceExternal = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s} [{d} common_resource_attributes]\n", .{ resource.id.slice(tree.source), resource.type.slice(tree.source), 
resource.common_resource_attributes.len }); try resource.filename.dump(tree, writer, indent + 1); }, .resource_raw_data => { - const resource: *Node.ResourceRawData = @alignCast(@fieldParentPtr("base", node)); + const resource: *const Node.ResourceRawData = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s} [{d} common_resource_attributes] raw data: {}\n", .{ resource.id.slice(tree.source), resource.type.slice(tree.source), resource.common_resource_attributes.len, resource.raw_data.len }); for (resource.raw_data) |data_expression| { try data_expression.dump(tree, writer, indent + 1); } }, .literal => { - const literal: *Node.Literal = @alignCast(@fieldParentPtr("base", node)); + const literal: *const Node.Literal = @alignCast(@fieldParentPtr("base", node)); try writer.writeAll(" "); try writer.writeAll(literal.token.slice(tree.source)); try writer.writeAll("\n"); }, .binary_expression => { - const binary: *Node.BinaryExpression = @alignCast(@fieldParentPtr("base", node)); + const binary: *const Node.BinaryExpression = @alignCast(@fieldParentPtr("base", node)); try writer.writeAll(" "); try writer.writeAll(binary.operator.slice(tree.source)); try writer.writeAll("\n"); @@ -766,7 +766,7 @@ pub const Node = struct { try binary.right.dump(tree, writer, indent + 1); }, .grouped_expression => { - const grouped: *Node.GroupedExpression = @alignCast(@fieldParentPtr("base", node)); + const grouped: *const Node.GroupedExpression = @alignCast(@fieldParentPtr("base", node)); try writer.writeAll("\n"); try writer.writeByteNTimes(' ', indent); try writer.writeAll(grouped.open_token.slice(tree.source)); @@ -777,7 +777,7 @@ pub const Node = struct { try writer.writeAll("\n"); }, .not_expression => { - const not: *Node.NotExpression = @alignCast(@fieldParentPtr("base", node)); + const not: *const Node.NotExpression = @alignCast(@fieldParentPtr("base", node)); try writer.writeAll(" "); try writer.writeAll(not.not_token.slice(tree.source)); try writer.writeAll(" 
"); @@ -785,7 +785,7 @@ pub const Node = struct { try writer.writeAll("\n"); }, .accelerators => { - const accelerators: *Node.Accelerators = @alignCast(@fieldParentPtr("base", node)); + const accelerators: *const Node.Accelerators = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s} [{d} common_resource_attributes]\n", .{ accelerators.id.slice(tree.source), accelerators.type.slice(tree.source), accelerators.common_resource_attributes.len }); for (accelerators.optional_statements) |statement| { try statement.dump(tree, writer, indent + 1); @@ -801,7 +801,7 @@ pub const Node = struct { try writer.writeAll("\n"); }, .accelerator => { - const accelerator: *Node.Accelerator = @alignCast(@fieldParentPtr("base", node)); + const accelerator: *const Node.Accelerator = @alignCast(@fieldParentPtr("base", node)); for (accelerator.type_and_options, 0..) |option, i| { if (i != 0) try writer.writeAll(","); try writer.writeByte(' '); @@ -812,7 +812,7 @@ pub const Node = struct { try accelerator.idvalue.dump(tree, writer, indent + 1); }, .dialog => { - const dialog: *Node.Dialog = @alignCast(@fieldParentPtr("base", node)); + const dialog: *const Node.Dialog = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s} [{d} common_resource_attributes]\n", .{ dialog.id.slice(tree.source), dialog.type.slice(tree.source), dialog.common_resource_attributes.len }); inline for (.{ "x", "y", "width", "height" }) |arg| { try writer.writeByteNTimes(' ', indent + 1); @@ -838,7 +838,7 @@ pub const Node = struct { try writer.writeAll("\n"); }, .control_statement => { - const control: *Node.ControlStatement = @alignCast(@fieldParentPtr("base", node)); + const control: *const Node.ControlStatement = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s}", .{control.type.slice(tree.source)}); if (control.text) |text| { try writer.print(" text: {s}", .{text.slice(tree.source)}); @@ -874,7 +874,7 @@ pub const Node = struct { } }, .toolbar => { - const 
toolbar: *Node.Toolbar = @alignCast(@fieldParentPtr("base", node)); + const toolbar: *const Node.Toolbar = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s} [{d} common_resource_attributes]\n", .{ toolbar.id.slice(tree.source), toolbar.type.slice(tree.source), toolbar.common_resource_attributes.len }); inline for (.{ "button_width", "button_height" }) |arg| { try writer.writeByteNTimes(' ', indent + 1); @@ -892,7 +892,7 @@ pub const Node = struct { try writer.writeAll("\n"); }, .menu => { - const menu: *Node.Menu = @alignCast(@fieldParentPtr("base", node)); + const menu: *const Node.Menu = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s} [{d} common_resource_attributes]\n", .{ menu.id.slice(tree.source), menu.type.slice(tree.source), menu.common_resource_attributes.len }); for (menu.optional_statements) |statement| { try statement.dump(tree, writer, indent + 1); @@ -913,16 +913,16 @@ pub const Node = struct { try writer.writeAll("\n"); }, .menu_item => { - const menu_item: *Node.MenuItem = @alignCast(@fieldParentPtr("base", node)); + const menu_item: *const Node.MenuItem = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s} [{d} options]\n", .{ menu_item.menuitem.slice(tree.source), menu_item.text.slice(tree.source), menu_item.option_list.len }); try menu_item.result.dump(tree, writer, indent + 1); }, .menu_item_separator => { - const menu_item: *Node.MenuItemSeparator = @alignCast(@fieldParentPtr("base", node)); + const menu_item: *const Node.MenuItemSeparator = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s}\n", .{ menu_item.menuitem.slice(tree.source), menu_item.separator.slice(tree.source) }); }, .menu_item_ex => { - const menu_item: *Node.MenuItemEx = @alignCast(@fieldParentPtr("base", node)); + const menu_item: *const Node.MenuItemEx = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s}\n", .{ menu_item.menuitem.slice(tree.source), 
menu_item.text.slice(tree.source) }); inline for (.{ "id", "type", "state" }) |arg| { if (@field(menu_item, arg)) |val_node| { @@ -933,7 +933,7 @@ pub const Node = struct { } }, .popup => { - const popup: *Node.Popup = @alignCast(@fieldParentPtr("base", node)); + const popup: *const Node.Popup = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s} [{d} options]\n", .{ popup.popup.slice(tree.source), popup.text.slice(tree.source), popup.option_list.len }); try writer.writeByteNTimes(' ', indent); try writer.writeAll(popup.begin_token.slice(tree.source)); @@ -946,7 +946,7 @@ pub const Node = struct { try writer.writeAll("\n"); }, .popup_ex => { - const popup: *Node.PopupEx = @alignCast(@fieldParentPtr("base", node)); + const popup: *const Node.PopupEx = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s}\n", .{ popup.popup.slice(tree.source), popup.text.slice(tree.source) }); inline for (.{ "id", "type", "state", "help_id" }) |arg| { if (@field(popup, arg)) |val_node| { @@ -966,7 +966,7 @@ pub const Node = struct { try writer.writeAll("\n"); }, .version_info => { - const version_info: *Node.VersionInfo = @alignCast(@fieldParentPtr("base", node)); + const version_info: *const Node.VersionInfo = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s} [{d} common_resource_attributes]\n", .{ version_info.id.slice(tree.source), version_info.versioninfo.slice(tree.source), version_info.common_resource_attributes.len }); for (version_info.fixed_info) |fixed_info| { try fixed_info.dump(tree, writer, indent + 1); @@ -982,14 +982,14 @@ pub const Node = struct { try writer.writeAll("\n"); }, .version_statement => { - const version_statement: *Node.VersionStatement = @alignCast(@fieldParentPtr("base", node)); + const version_statement: *const Node.VersionStatement = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s}\n", .{version_statement.type.slice(tree.source)}); for (version_statement.parts) |part| { try 
part.dump(tree, writer, indent + 1); } }, .block => { - const block: *Node.Block = @alignCast(@fieldParentPtr("base", node)); + const block: *const Node.Block = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s}\n", .{ block.identifier.slice(tree.source), block.key.slice(tree.source) }); for (block.values) |value| { try value.dump(tree, writer, indent + 1); @@ -1005,14 +1005,14 @@ pub const Node = struct { try writer.writeAll("\n"); }, .block_value => { - const block_value: *Node.BlockValue = @alignCast(@fieldParentPtr("base", node)); + const block_value: *const Node.BlockValue = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} {s}\n", .{ block_value.identifier.slice(tree.source), block_value.key.slice(tree.source) }); for (block_value.values) |value| { try value.dump(tree, writer, indent + 1); } }, .block_value_value => { - const block_value: *Node.BlockValueValue = @alignCast(@fieldParentPtr("base", node)); + const block_value: *const Node.BlockValueValue = @alignCast(@fieldParentPtr("base", node)); if (block_value.trailing_comma) { try writer.writeAll(" ,"); } @@ -1020,7 +1020,7 @@ pub const Node = struct { try block_value.expression.dump(tree, writer, indent + 1); }, .string_table => { - const string_table: *Node.StringTable = @alignCast(@fieldParentPtr("base", node)); + const string_table: *const Node.StringTable = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} [{d} common_resource_attributes]\n", .{ string_table.type.slice(tree.source), string_table.common_resource_attributes.len }); for (string_table.optional_statements) |statement| { try statement.dump(tree, writer, indent + 1); @@ -1037,19 +1037,19 @@ pub const Node = struct { }, .string_table_string => { try writer.writeAll("\n"); - const string: *Node.StringTableString = @alignCast(@fieldParentPtr("base", node)); + const string: *const Node.StringTableString = @alignCast(@fieldParentPtr("base", node)); try string.id.dump(tree, writer, indent + 1); 
try writer.writeByteNTimes(' ', indent + 1); try writer.print("{s}\n", .{string.string.slice(tree.source)}); }, .language_statement => { - const language: *Node.LanguageStatement = @alignCast(@fieldParentPtr("base", node)); + const language: *const Node.LanguageStatement = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s}\n", .{language.language_token.slice(tree.source)}); try language.primary_language_id.dump(tree, writer, indent + 1); try language.sublanguage_id.dump(tree, writer, indent + 1); }, .font_statement => { - const font: *Node.FontStatement = @alignCast(@fieldParentPtr("base", node)); + const font: *const Node.FontStatement = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s} typeface: {s}\n", .{ font.identifier.slice(tree.source), font.typeface.slice(tree.source) }); try writer.writeByteNTimes(' ', indent + 1); try writer.writeAll("point_size:\n"); @@ -1063,12 +1063,12 @@ pub const Node = struct { } }, .simple_statement => { - const statement: *Node.SimpleStatement = @alignCast(@fieldParentPtr("base", node)); + const statement: *const Node.SimpleStatement = @alignCast(@fieldParentPtr("base", node)); try writer.print(" {s}\n", .{statement.identifier.slice(tree.source)}); try statement.value.dump(tree, writer, indent + 1); }, .invalid => { - const invalid: *Node.Invalid = @alignCast(@fieldParentPtr("base", node)); + const invalid: *const Node.Invalid = @alignCast(@fieldParentPtr("base", node)); try writer.print(" context.len: {}\n", .{invalid.context.len}); for (invalid.context) |context_token| { try writer.writeByteNTimes(' ', indent + 1); diff --git a/lib/compiler/resinator/bmp.zig b/lib/compiler/resinator/bmp.zig index 03a8638ad75d..c9a0c29da065 100644 --- a/lib/compiler/resinator/bmp.zig +++ b/lib/compiler/resinator/bmp.zig @@ -60,9 +60,16 @@ pub const BitmapInfo = struct { } pub fn getBitmasksByteLen(self: *const BitmapInfo) u8 { - return switch (self.compression) { - .BI_BITFIELDS => 12, - .BI_ALPHABITFIELDS => 16, + 
// Only BITMAPINFOHEADER (3.1) has trailing bytes for the BITFIELDS + // The 2.0 format doesn't have a compression field and 4.0+ has dedicated + // fields for the masks in the header. + const dib_version = BitmapHeader.Version.get(self.dib_header_size); + return switch (dib_version) { + .@"nt3.1" => switch (self.compression) { + .BI_BITFIELDS => 12, + .BI_ALPHABITFIELDS => 16, + else => 0, + }, else => 0, }; } diff --git a/lib/compiler/resinator/cli.zig b/lib/compiler/resinator/cli.zig index 1223b069d76d..a002c9c61574 100644 --- a/lib/compiler/resinator/cli.zig +++ b/lib/compiler/resinator/cli.zig @@ -1,5 +1,6 @@ const std = @import("std"); -const CodePage = @import("code_pages.zig").CodePage; +const code_pages = @import("code_pages.zig"); +const SupportedCodePage = code_pages.SupportedCodePage; const lang = @import("lang.zig"); const res = @import("res.zig"); const Allocator = std.mem.Allocator; @@ -14,6 +15,8 @@ pub const usage_string_after_command_name = \\The sequence -- can be used to signify when to stop parsing options. \\This is necessary when the input path begins with a forward slash. \\ + \\Supported option prefixes are /, -, and --, so e.g. /h, -h, and --h all work. + \\ \\Supported Win32 RC Options: \\ /?, /h Print this help and exit. \\ /v Verbose (print progress messages). @@ -56,8 +59,6 @@ pub const usage_string_after_command_name = \\ the .rc includes or otherwise depends on. \\ /:depfile-fmt Output format of the depfile, if /:depfile is set. \\ json (default) A top-level JSON array of paths - \\ /:mingw-includes Path to a directory containing MinGW include files. If - \\ not specified, bundled MinGW include files will be used. 
\\ \\Note: For compatibility reasons, all custom options start with : \\ @@ -136,7 +137,7 @@ pub const Options = struct { ignore_include_env_var: bool = false, preprocess: Preprocess = .yes, default_language_id: ?u16 = null, - default_code_page: ?CodePage = null, + default_code_page: ?SupportedCodePage = null, verbose: bool = false, symbols: std.StringArrayHashMapUnmanaged(SymbolValue) = .empty, null_terminate_string_table_strings: bool = false, @@ -148,7 +149,6 @@ pub const Options = struct { auto_includes: AutoIncludes = .any, depfile_path: ?[]const u8 = null, depfile_fmt: DepfileFormat = .json, - mingw_includes_dir: ?[]const u8 = null, pub const AutoIncludes = enum { any, msvc, gnu, none }; pub const DepfileFormat = enum { json }; @@ -243,9 +243,6 @@ pub const Options = struct { if (self.depfile_path) |depfile_path| { self.allocator.free(depfile_path); } - if (self.mingw_includes_dir) |mingw_includes_dir| { - self.allocator.free(mingw_includes_dir); - } } pub fn dumpVerbose(self: *const Options, writer: anytype) !void { @@ -358,6 +355,29 @@ pub const Arg = struct { }; } + pub fn looksLikeFilepath(self: Arg) bool { + const meets_min_requirements = self.prefix == .slash and isSupportedInputExtension(std.fs.path.extension(self.full)); + if (!meets_min_requirements) return false; + + const could_be_fo_option = could_be_fo_option: { + var window_it = std.mem.window(u8, self.full[1..], 2, 1); + while (window_it.next()) |window| { + if (std.ascii.eqlIgnoreCase(window, "fo")) break :could_be_fo_option true; + // If we see '/' before "fo", then it's not possible for this to be a valid + // `/fo` option. + if (window[0] == '/') break; + } + break :could_be_fo_option false; + }; + if (!could_be_fo_option) return true; + + // It's still possible for a file path to look like a /fo option but not actually + // be one, e.g. `/foo/bar.rc`. 
As a last ditch effort to reduce false negatives, + // check if the file path exists and, if so, then we ignore the 'could be /fo option'-ness + std.fs.accessAbsolute(self.full, .{}) catch return false; + return true; + } + pub const Value = struct { slice: []const u8, index_increment: u2 = 1, @@ -432,6 +452,16 @@ pub fn parse(allocator: Allocator, args: []const []const u8, diagnostics: *Diagn } } + const args_remaining = args.len - arg_i; + if (args_remaining <= 2 and arg.looksLikeFilepath()) { + var err_details = Diagnostics.ErrorDetails{ .type = .note, .print_args = true, .arg_index = arg_i }; + var msg_writer = err_details.msg.writer(allocator); + try msg_writer.writeAll("this argument was inferred to be a filepath, so argument parsing was terminated"); + try diagnostics.append(err_details); + + break; + } + while (arg.name().len > 0) { const arg_name = arg.name(); // Note: These cases should be in order from longest to shortest, since @@ -440,24 +470,6 @@ pub fn parse(allocator: Allocator, args: []const []const u8, diagnostics: *Diagn if (std.ascii.startsWithIgnoreCase(arg_name, ":no-preprocess")) { options.preprocess = .no; arg.name_offset += ":no-preprocess".len; - } else if (std.ascii.startsWithIgnoreCase(arg_name, ":mingw-includes")) { - const value = arg.value(":mingw-includes".len, arg_i, args) catch { - var err_details = Diagnostics.ErrorDetails{ .arg_index = arg_i, .arg_span = arg.missingSpan() }; - var msg_writer = err_details.msg.writer(allocator); - try msg_writer.print("missing value after {s}{s} option", .{ arg.prefixSlice(), arg.optionWithoutPrefix(":mingw-includes".len) }); - try diagnostics.append(err_details); - arg_i += 1; - break :next_arg; - }; - if (options.mingw_includes_dir) |overwritten_path| { - allocator.free(overwritten_path); - options.mingw_includes_dir = null; - } - const path = try allocator.dupe(u8, value.slice); - errdefer allocator.free(path); - options.mingw_includes_dir = path; - arg_i += value.index_increment; - continue 
:next_arg; } else if (std.ascii.startsWithIgnoreCase(arg_name, ":auto-includes")) { const value = arg.value(":auto-includes".len, arg_i, args) catch { var err_details = Diagnostics.ErrorDetails{ .arg_index = arg_i, .arg_span = arg.missingSpan() }; @@ -769,7 +781,7 @@ pub fn parse(allocator: Allocator, args: []const []const u8, diagnostics: *Diagn arg_i += value.index_increment; continue :next_arg; }; - options.default_code_page = CodePage.getByIdentifierEnsureSupported(code_page_id) catch |err| switch (err) { + options.default_code_page = code_pages.getByIdentifierEnsureSupported(code_page_id) catch |err| switch (err) { error.InvalidCodePage => { var err_details = Diagnostics.ErrorDetails{ .arg_index = arg_i, .arg_span = value.argSpan(arg) }; var msg_writer = err_details.msg.writer(allocator); @@ -782,7 +794,7 @@ pub fn parse(allocator: Allocator, args: []const []const u8, diagnostics: *Diagn var err_details = Diagnostics.ErrorDetails{ .arg_index = arg_i, .arg_span = value.argSpan(arg) }; var msg_writer = err_details.msg.writer(allocator); try msg_writer.print("unsupported code page: {s} (id={})", .{ - @tagName(CodePage.getByIdentifier(code_page_id) catch unreachable), + @tagName(code_pages.getByIdentifier(code_page_id) catch unreachable), code_page_id, }); try diagnostics.append(err_details); @@ -900,18 +912,20 @@ pub fn parse(allocator: Allocator, args: []const []const u8, diagnostics: *Diagn const positionals = args[arg_i..]; - if (positionals.len < 1) { + if (positionals.len == 0) { var err_details = Diagnostics.ErrorDetails{ .print_args = false, .arg_index = arg_i }; var msg_writer = err_details.msg.writer(allocator); try msg_writer.writeAll("missing input filename"); try diagnostics.append(err_details); - const last_arg = args[args.len - 1]; - if (arg_i > 0 and last_arg.len > 0 and last_arg[0] == '/' and std.ascii.endsWithIgnoreCase(last_arg, ".rc")) { - var note_details = Diagnostics.ErrorDetails{ .type = .note, .print_args = true, .arg_index = arg_i - 1 }; 
- var note_writer = note_details.msg.writer(allocator); - try note_writer.writeAll("if this argument was intended to be the input filename, then -- should be specified in front of it to exclude it from option parsing"); - try diagnostics.append(note_details); + if (args.len > 0) { + const last_arg = args[args.len - 1]; + if (arg_i > 0 and last_arg.len > 0 and last_arg[0] == '/' and std.ascii.endsWithIgnoreCase(last_arg, ".rc")) { + var note_details = Diagnostics.ErrorDetails{ .type = .note, .print_args = true, .arg_index = arg_i - 1 }; + var note_writer = note_details.msg.writer(allocator); + try note_writer.writeAll("if this argument was intended to be the input filename, then -- should be specified in front of it to exclude it from option parsing"); + try diagnostics.append(note_details); + } } // This is a fatal enough problem to justify an early return, since @@ -969,6 +983,12 @@ pub fn parse(allocator: Allocator, args: []const []const u8, diagnostics: *Diagn return options; } +pub fn isSupportedInputExtension(ext: []const u8) bool { + if (std.ascii.eqlIgnoreCase(ext, ".rc")) return true; + if (std.ascii.eqlIgnoreCase(ext, ".rcpp")) return true; + return false; +} + /// Returns true if the str is a valid C identifier for use in a #define/#undef macro pub fn isValidIdentifier(str: []const u8) bool { for (str, 0..) |c, i| switch (c) { @@ -1271,6 +1291,43 @@ test "parse errors: basic" { ); } +test "inferred absolute filepaths" { + { + var options = try testParseWarning(&.{ "/fo", "foo.res", "/home/absolute/path.rc" }, + \\: note: this argument was inferred to be a filepath, so argument parsing was terminated + \\ ... /home/absolute/path.rc + \\ ^~~~~~~~~~~~~~~~~~~~~~ + \\ + ); + defer options.deinit(); + } + { + var options = try testParseWarning(&.{ "/home/absolute/path.rc", "foo.res" }, + \\: note: this argument was inferred to be a filepath, so argument parsing was terminated + \\ ... /home/absolute/path.rc ... 
+ \\ ^~~~~~~~~~~~~~~~~~~~~~ + \\ + ); + defer options.deinit(); + } + { + // Only the last two arguments are checked, so the /h is parsed as an option + var options = try testParse(&.{ "/home/absolute/path.rc", "foo.rc", "foo.res" }); + defer options.deinit(); + + try std.testing.expect(options.print_help_and_exit); + } + { + var options = try testParse(&.{ "/xvFO/some/absolute/path.res", "foo.rc" }); + defer options.deinit(); + + try std.testing.expectEqual(true, options.verbose); + try std.testing.expectEqual(true, options.ignore_include_env_var); + try std.testing.expectEqualStrings("foo.rc", options.input_source.filename); + try std.testing.expectEqualStrings("/some/absolute/path.res", options.output_source.filename); + } +} + test "parse errors: /ln" { try testParseError(&.{ "/ln", "invalid", "foo.rc" }, \\: error: invalid language tag: invalid diff --git a/lib/compiler/resinator/code_pages.zig b/lib/compiler/resinator/code_pages.zig index 48131678114a..53cbdb768cd8 100644 --- a/lib/compiler/resinator/code_pages.zig +++ b/lib/compiler/resinator/code_pages.zig @@ -1,86 +1,30 @@ const std = @import("std"); const windows1252 = @import("windows1252.zig"); -// TODO: Parts of this comment block may be more relevant to string/NameOrOrdinal parsing -// than it is to the stuff in this file. 
-// -// ‰ representations for context: -// Win-1252 89 -// UTF-8 E2 80 B0 -// UTF-16 20 30 -// -// With code page 65001: -// ‰ RCDATA { "‰" L"‰" } -// File encoded as Windows-1252: -// ‰ => as u16 -// "‰" => 0x3F ('?') -// L"‰" => as u16 -// File encoded as UTF-8: -// ‰ => as u16 -// "‰" => 0x89 ('‰' encoded as Windows-1252) -// L"‰" => as u16 -// -// With code page 1252: -// ‰ RCDATA { "‰" L"‰" } -// File encoded as Windows-1252: -// ‰ => as u16 -// "‰" => 0x89 ('‰' encoded as Windows-1252) -// L"‰" => as u16 -// File encoded as UTF-8: -// ‰ => 0xE2 as u16, 0x20AC as u16, 0xB0 as u16 -// ^ first byte of utf8 representation -// ^ second byte of UTF-8 representation (0x80), but interpretted as -// Windows-1252 ('€') and then converted to UTF-16 () -// ^ third byte of utf8 representation -// "‰" => 0xE2, 0x80, 0xB0 (the bytes of the UTF-8 representation) -// L"‰" => 0xE2 as u16, 0x20AC as u16, 0xB0 as u16 (see '‰ =>' explanation) -// -// With code page 1252: -// <0x90> RCDATA { "<0x90>" L"<0x90>" } -// File encoded as Windows-1252: -// <0x90> => 0x90 as u16 -// "<0x90>" => 0x90 -// L"<0x90>" => 0x90 as u16 -// File encoded as UTF-8: -// <0x90> => 0xC2 as u16, 0x90 as u16 -// "<0x90>" => 0xC2, 0x90 (the bytes of the UTF-8 representation of ) -// L"<0x90>" => 0xC2 as u16, 0x90 as u16 -// -// Within a raw data block, file encoded as Windows-1252 ( is <0xC2>): -// "Âa" L"Âa" "\xC2ad" L"\xC2AD" -// With code page 1252: -// C2 61 C2 00 61 00 C2 61 64 AD C2 -// Â^ a^ Â~~~^ a~~~^ .^ a^ d^ ^~~~~\xC2AD -// \xC2~` -// With code page 65001: -// 3F 61 FD FF 61 00 C2 61 64 AD C2 -// ^. a^ ^~~~. a~~~^ ^. a^ d^ ^~~~~\xC2AD -// `. `. `~\xC2 -// `. `.~<0xC2>a is not well-formed UTF-8 (0xC2 expects a continutation byte after it). -// `. Because 'a' is a valid first byte of a UTF-8 sequence, it is not included in the -// `. invalid sequence so only the <0xC2> gets converted to . -// `~Same as ^ but converted to '?' instead. 
-// -// Within a raw data block, file encoded as Windows-1252 (ð is <0xF0>, € is <0x80>): -// "ð€a" L"ð€a" -// With code page 1252: -// F0 80 61 F0 00 AC 20 61 00 -// ð^ €^ a^ ð~~~^ €~~~^ a~~~^ -// With code page 65001: -// 3F 61 FD FF 61 00 -// ^. a^ ^~~~. a~~~^ -// `. `. -// `. `.~<0xF0><0x80> is not well-formed UTF-8, and <0x80> is not a valid first byte, so -// `. both bytes are considered an invalid sequence and get converted to '' -// `~Same as ^ but converted to '?' instead. - /// https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers -pub const CodePage = enum(u16) { - // supported +pub const SupportedCodePage = enum(u16) { windows1252 = 1252, // windows-1252 ANSI Latin 1; Western European (Windows) utf8 = 65001, // utf-8 Unicode (UTF-8) - // unsupported but valid + pub fn codepointAt(code_page: SupportedCodePage, index: usize, bytes: []const u8) ?Codepoint { + if (index >= bytes.len) return null; + switch (code_page) { + .windows1252 => { + // All byte values have a representation, so just convert the byte + return Codepoint{ + .value = windows1252.toCodepoint(bytes[index]), + .byte_len = 1, + }; + }, + .utf8 => { + return Utf8.WellFormedDecoder.decode(bytes[index..]); + }, + } + } +}; + +/// https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers +pub const UnsupportedCodePage = enum(u16) { ibm037 = 37, // IBM037 IBM EBCDIC US-Canada ibm437 = 437, // IBM437 OEM United States ibm500 = 500, // IBM500 IBM EBCDIC International @@ -231,50 +175,45 @@ pub const CodePage = enum(u16) { x_iscii_gu = 57010, // x-iscii-gu ISCII Gujarati x_iscii_pa = 57011, // x-iscii-pa ISCII Punjabi utf7 = 65000, // utf-7 Unicode (UTF-7) +}; - pub fn codepointAt(code_page: CodePage, index: usize, bytes: []const u8) ?Codepoint { - if (index >= bytes.len) return null; - switch (code_page) { - .windows1252 => { - // All byte values have a representation, so just convert the byte - return Codepoint{ - .value = windows1252.toCodepoint(bytes[index]), 
- .byte_len = 1, - }; - }, - .utf8 => { - return Utf8.WellFormedDecoder.decode(bytes[index..]); - }, - else => unreachable, - } - } - - pub fn isSupported(code_page: CodePage) bool { - return switch (code_page) { - .windows1252, .utf8 => true, - else => false, - }; - } +pub const CodePage = blk: { + const fields = @typeInfo(SupportedCodePage).@"enum".fields ++ @typeInfo(UnsupportedCodePage).@"enum".fields; + break :blk @Type(.{ .@"enum" = .{ + .tag_type = u16, + .decls = &.{}, + .fields = fields, + .is_exhaustive = true, + } }); +}; - pub fn getByIdentifier(identifier: u16) !CodePage { - // There's probably a more efficient way to do this (e.g. ComptimeHashMap?) but - // this should be fine, especially since this function likely won't be called much. - inline for (@typeInfo(CodePage).@"enum".fields) |enumField| { - if (identifier == enumField.value) { - return @field(CodePage, enumField.name); - } +pub fn isSupported(code_page: CodePage) bool { + inline for (@typeInfo(SupportedCodePage).@"enum".fields) |enumField| { + if (@intFromEnum(code_page) == @intFromEnum(@field(SupportedCodePage, enumField.name))) { + return true; } - return error.InvalidCodePage; } + return false; +} - pub fn getByIdentifierEnsureSupported(identifier: u16) !CodePage { - const code_page = try getByIdentifier(identifier); - switch (isSupported(code_page)) { - true => return code_page, - false => return error.UnsupportedCodePage, +pub fn getByIdentifier(identifier: u16) !CodePage { + // There's probably a more efficient way to do this (e.g. ComptimeHashMap?) but + // this should be fine, especially since this function likely won't be called much. 
+ inline for (@typeInfo(CodePage).@"enum".fields) |enumField| { + if (identifier == enumField.value) { + return @field(CodePage, enumField.name); } } -}; + return error.InvalidCodePage; +} + +pub fn getByIdentifierEnsureSupported(identifier: u16) !SupportedCodePage { + const code_page = try getByIdentifier(identifier); + return if (isSupported(code_page)) + @enumFromInt(@intFromEnum(code_page)) + else + error.UnsupportedCodePage; +} pub const Utf8 = struct { /// Implements decoding with rejection of ill-formed UTF-8 sequences based on section @@ -378,20 +317,20 @@ test "codepointAt invalid utf8" { try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 1, - }, CodePage.utf8.codepointAt(0, invalid_utf8).?); + }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?); try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 2, - }, CodePage.utf8.codepointAt(1, invalid_utf8).?); + }, SupportedCodePage.utf8.codepointAt(1, invalid_utf8).?); try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 1, - }, CodePage.utf8.codepointAt(3, invalid_utf8).?); + }, SupportedCodePage.utf8.codepointAt(3, invalid_utf8).?); try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 1, - }, CodePage.utf8.codepointAt(4, invalid_utf8).?); - try std.testing.expectEqual(@as(?Codepoint, null), CodePage.windows1252.codepointAt(5, invalid_utf8)); + }, SupportedCodePage.utf8.codepointAt(4, invalid_utf8).?); + try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(5, invalid_utf8)); } { @@ -399,12 +338,12 @@ test "codepointAt invalid utf8" { try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 2, - }, CodePage.utf8.codepointAt(0, invalid_utf8).?); + }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?); try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 1, - }, CodePage.utf8.codepointAt(2, invalid_utf8).?); - try 
std.testing.expectEqual(@as(?Codepoint, null), CodePage.windows1252.codepointAt(3, invalid_utf8)); + }, SupportedCodePage.utf8.codepointAt(2, invalid_utf8).?); + try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(3, invalid_utf8)); } { @@ -412,8 +351,8 @@ test "codepointAt invalid utf8" { try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 1, - }, CodePage.utf8.codepointAt(0, invalid_utf8).?); - try std.testing.expectEqual(@as(?Codepoint, null), CodePage.windows1252.codepointAt(1, invalid_utf8)); + }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?); + try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(1, invalid_utf8)); } { @@ -421,8 +360,8 @@ test "codepointAt invalid utf8" { try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 2, - }, CodePage.utf8.codepointAt(0, invalid_utf8).?); - try std.testing.expectEqual(@as(?Codepoint, null), CodePage.windows1252.codepointAt(2, invalid_utf8)); + }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?); + try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(2, invalid_utf8)); } { @@ -430,12 +369,12 @@ test "codepointAt invalid utf8" { try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 1, - }, CodePage.utf8.codepointAt(0, invalid_utf8).?); + }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?); try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 1, - }, CodePage.utf8.codepointAt(1, invalid_utf8).?); - try std.testing.expectEqual(@as(?Codepoint, null), CodePage.windows1252.codepointAt(2, invalid_utf8)); + }, SupportedCodePage.utf8.codepointAt(1, invalid_utf8).?); + try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(2, invalid_utf8)); } { @@ -444,11 +383,11 @@ test "codepointAt invalid utf8" { try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 
2, - }, CodePage.utf8.codepointAt(0, invalid_utf8).?); + }, SupportedCodePage.utf8.codepointAt(0, invalid_utf8).?); try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 1, - }, CodePage.utf8.codepointAt(2, invalid_utf8).?); + }, SupportedCodePage.utf8.codepointAt(2, invalid_utf8).?); } } @@ -459,19 +398,19 @@ test "codepointAt utf8 encoded" { try std.testing.expectEqual(Codepoint{ .value = '²', .byte_len = 2, - }, CodePage.utf8.codepointAt(0, utf8_encoded).?); - try std.testing.expectEqual(@as(?Codepoint, null), CodePage.utf8.codepointAt(2, utf8_encoded)); + }, SupportedCodePage.utf8.codepointAt(0, utf8_encoded).?); + try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(2, utf8_encoded)); // with code page windows1252 try std.testing.expectEqual(Codepoint{ .value = '\xC2', .byte_len = 1, - }, CodePage.windows1252.codepointAt(0, utf8_encoded).?); + }, SupportedCodePage.windows1252.codepointAt(0, utf8_encoded).?); try std.testing.expectEqual(Codepoint{ .value = '\xB2', .byte_len = 1, - }, CodePage.windows1252.codepointAt(1, utf8_encoded).?); - try std.testing.expectEqual(@as(?Codepoint, null), CodePage.windows1252.codepointAt(2, utf8_encoded)); + }, SupportedCodePage.windows1252.codepointAt(1, utf8_encoded).?); + try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.windows1252.codepointAt(2, utf8_encoded)); } test "codepointAt windows1252 encoded" { @@ -481,15 +420,15 @@ test "codepointAt windows1252 encoded" { try std.testing.expectEqual(Codepoint{ .value = Codepoint.invalid, .byte_len = 1, - }, CodePage.utf8.codepointAt(0, windows1252_encoded).?); - try std.testing.expectEqual(@as(?Codepoint, null), CodePage.utf8.codepointAt(2, windows1252_encoded)); + }, SupportedCodePage.utf8.codepointAt(0, windows1252_encoded).?); + try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(2, windows1252_encoded)); // with code page windows1252 try 
std.testing.expectEqual(Codepoint{ .value = '\xB2', .byte_len = 1, - }, CodePage.windows1252.codepointAt(0, windows1252_encoded).?); - try std.testing.expectEqual(@as(?Codepoint, null), CodePage.windows1252.codepointAt(1, windows1252_encoded)); + }, SupportedCodePage.windows1252.codepointAt(0, windows1252_encoded).?); + try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.windows1252.codepointAt(1, windows1252_encoded)); } pub const Codepoint = struct { diff --git a/lib/compiler/resinator/comments.zig b/lib/compiler/resinator/comments.zig index 67504bbbeb28..86c482ef8162 100644 --- a/lib/compiler/resinator/comments.zig +++ b/lib/compiler/resinator/comments.zig @@ -174,6 +174,21 @@ pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMa }, }, } + } else { + switch (state) { + .start, + .line_comment, + .multiline_comment, + .multiline_comment_end, + .single_quoted, + .single_quoted_escape, + .double_quoted, + .double_quoted_escape, + => {}, + .forward_slash => { + result.writeSlice(source[pending_start.?..index]); + }, + } } return result.getWritten(); } @@ -334,6 +349,16 @@ test "comments appended to a line" { ); } +test "forward slash only" { + try testRemoveComments( + \\ / + \\/ + , + \\ / + \\/ + ); +} + test "remove comments with mappings" { const allocator = std.testing.allocator; var mut_source = "blah/*\rcommented line*\r/blah".*; diff --git a/lib/compiler/resinator/compile.zig b/lib/compiler/resinator/compile.zig index 58259cf4c47a..ab8cb73dcc7c 100644 --- a/lib/compiler/resinator/compile.zig +++ b/lib/compiler/resinator/compile.zig @@ -4,7 +4,7 @@ const Allocator = std.mem.Allocator; const Node = @import("ast.zig").Node; const lex = @import("lex.zig"); const Parser = @import("parse.zig").Parser; -const Resource = @import("rc.zig").Resource; +const ResourceType = @import("rc.zig").ResourceType; const Token = @import("lex.zig").Token; const literals = @import("literals.zig"); const Number = literals.Number; @@ -21,7 
+21,7 @@ const WORD = std.os.windows.WORD; const DWORD = std.os.windows.DWORD; const utils = @import("utils.zig"); const NameOrOrdinal = res.NameOrOrdinal; -const CodePage = @import("code_pages.zig").CodePage; +const SupportedCodePage = @import("code_pages.zig").SupportedCodePage; const CodePageLookup = @import("ast.zig").CodePageLookup; const SourceMappings = @import("source_mapping.zig").SourceMappings; const windows1252 = @import("windows1252.zig"); @@ -39,7 +39,10 @@ pub const CompileOptions = struct { /// freed by the caller. /// TODO: Maybe a dedicated struct for this purpose so that it's a bit nicer to work with. dependencies_list: ?*std.ArrayList([]const u8) = null, - default_code_page: CodePage = .windows1252, + default_code_page: SupportedCodePage = .windows1252, + /// If true, the first #pragma code_page directive only sets the input code page, but not the output code page. + /// This check must be done before comments are removed from the file. + disjoint_code_page: bool = false, ignore_include_env_var: bool = false, extra_include_paths: []const []const u8 = &.{}, /// This is just an API convenience to allow separately passing 'system' (i.e. 
those @@ -66,6 +69,7 @@ pub fn compile(allocator: Allocator, source: []const u8, writer: anytype, option }); var parser = Parser.init(&lexer, .{ .warn_instead_of_error_on_invalid_code_page = options.warn_instead_of_error_on_invalid_code_page, + .disjoint_code_page = options.disjoint_code_page, }); var tree = try parser.parse(allocator, options.diagnostics); defer tree.deinit(); @@ -98,6 +102,7 @@ pub fn compile(allocator: Allocator, source: []const u8, writer: anytype, option .end = 0, .line_number = 1, }, + .code_page = .utf8, .print_source_line = false, .extra = .{ .file_open_error = .{ .err = ErrorDetails.FileOpenError.enumFromError(err), @@ -213,7 +218,12 @@ pub const Compiler = struct { try self.addErrorDetails(.{ .err = .result_contains_fontdir, .type = .hint, - .token = undefined, + .token = .{ + .id = .invalid, + .start = 0, + .end = 0, + .line_number = 1, + }, }); } // once we've written every else out, we can write out the finalized STRINGTABLE resources @@ -301,7 +311,10 @@ pub const Compiler = struct { // UTF-8, we can parse either string type directly to UTF-8. var parser = literals.IterativeStringParser.init(bytes, .{ .start_column = column, - .diagnostics = .{ .diagnostics = self.diagnostics, .token = literal_node.token }, + .diagnostics = self.errContext(literal_node.token), + // TODO: Re-evaluate this. It's not been tested whether or not using the actual + // output code page would make more sense. + .output_code_page = .windows1252, }); while (try parser.nextUnchecked()) |parsed| { @@ -401,56 +414,55 @@ pub const Compiler = struct { return first_error orelse error.FileNotFound; } + /// Returns a Windows-1252 encoded string regardless of the current output code page. + /// All codepoints are encoded as a maximum of 2 bytes, where unescaped codepoints + /// >= 0x10000 are encoded as `??` and everything else is encoded as 1 byte. 
pub fn parseDlgIncludeString(self: *Compiler, token: Token) ![]u8 { - // For the purposes of parsing, we want to strip the L prefix - // if it exists since we want escaped integers to be limited to - // their ascii string range. - // - // We keep track of whether or not there was an L prefix, though, - // since there's more weirdness to come. - var bytes = self.sourceBytesForToken(token); - var was_wide_string = false; - if (bytes.slice[0] == 'L' or bytes.slice[0] == 'l') { - was_wide_string = true; - bytes.slice = bytes.slice[1..]; - } + const bytes = self.sourceBytesForToken(token); + const output_code_page = self.output_code_pages.getForToken(token); var buf = try std.ArrayList(u8).initCapacity(self.allocator, bytes.slice.len); errdefer buf.deinit(); var iterative_parser = literals.IterativeStringParser.init(bytes, .{ .start_column = token.calculateColumn(self.source, 8, null), - .diagnostics = .{ .diagnostics = self.diagnostics, .token = token }, + .diagnostics = self.errContext(token), + // TODO: Potentially re-evaluate this, it's not been tested whether or not + // using the actual output code page would make more sense. + .output_code_page = .windows1252, }); - // No real idea what's going on here, but this matches the rc.exe behavior + // This is similar to the logic in parseQuotedString, but ends up with everything + // encoded as Windows-1252. This effectively consolidates the two-step process + // of rc.exe into one step, since rc.exe's preprocessor converts to UTF-16 (this + // is when invalid sequences are replaced by the replacement character (U+FFFD)), + // and then that's run through the parser. Our preprocessor keeps things in their + // original encoding, meaning we emulate the -> UTF-16 -> Windows-1252 + // results all at once. 
while (try iterative_parser.next()) |parsed| { const c = parsed.codepoint; - switch (was_wide_string) { - true => { - switch (c) { - 0...0x7F, 0xA0...0xFF => try buf.append(@intCast(c)), - 0x80...0x9F => { - if (windows1252.bestFitFromCodepoint(c)) |_| { - try buf.append(@intCast(c)); - } else { - try buf.append('?'); - } - }, - else => { - if (windows1252.bestFitFromCodepoint(c)) |best_fit| { - try buf.append(best_fit); - } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) { - try buf.append('?'); - } else { - try buf.appendSlice("??"); - } - }, + switch (iterative_parser.declared_string_type) { + .wide => { + if (windows1252.bestFitFromCodepoint(c)) |best_fit| { + try buf.append(best_fit); + } else if (c < 0x10000 or c == code_pages.Codepoint.invalid or parsed.escaped_surrogate_pair) { + try buf.append('?'); + } else { + try buf.appendSlice("??"); } }, - false => { + .ascii => { if (parsed.from_escaped_integer) { - try buf.append(@truncate(c)); + const truncated: u8 = @truncate(c); + switch (output_code_page) { + .utf8 => switch (truncated) { + 0...0x7F => try buf.append(truncated), + else => try buf.append('?'), + }, + .windows1252 => { + try buf.append(truncated); + }, + } } else { if (windows1252.bestFitFromCodepoint(c)) |best_fit| { try buf.append(best_fit); @@ -484,8 +496,12 @@ pub const Compiler = struct { const parsed_filename_terminated = std.mem.sliceTo(parsed_filename, 0); header.applyMemoryFlags(node.common_resource_attributes, self.source); + // This is effectively limited by `max_string_literal_codepoints` which is a u15. + // Each codepoint within a DLGINCLUDE string is encoded as a maximum of + // 2 bytes, which means that the maximum byte length of a DLGINCLUDE string is + // (including the NUL terminator): 32,767 * 2 + 1 = 65,535 or exactly the u16 max. 
header.data_size = @intCast(parsed_filename_terminated.len + 1); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); try writer.writeAll(parsed_filename_terminated); try writer.writeByte(0); try writeDataPadding(writer, header.data_size); @@ -568,7 +584,7 @@ pub const Compiler = struct { header.applyMemoryFlags(node.common_resource_attributes, self.source); header.data_size = @intCast(try file.getEndPos()); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); try file.seekTo(0); try writeResourceData(writer, file.reader(), header.data_size); return; @@ -644,7 +660,7 @@ pub const Compiler = struct { .version = self.state.version, .characteristics = self.state.characteristics, }; - try image_header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try image_header.write(writer, self.errContext(node.id)); // From https://learn.microsoft.com/en-us/windows/win32/menurc/localheader: // > The LOCALHEADER structure is the first data written to the RT_CURSOR @@ -817,12 +833,26 @@ pub const Compiler = struct { header.data_size = icon_dir.getResDataSize(); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); try icon_dir.writeResData(writer, first_icon_id); try writeDataPadding(writer, header.data_size); return; }, - .RCDATA, .HTML, .MANIFEST, .MESSAGETABLE, .DLGINIT, .PLUGPLAY => { + .RCDATA, + .HTML, + .MESSAGETABLE, + .DLGINIT, + .PLUGPLAY, + .VXD, + // Note: All of the below can only be specified by using a number + // as the resource type. 
+ .MANIFEST, + .CURSOR, + .ICON, + .ANICURSOR, + .ANIICON, + .FONTDIR, + => { header.applyMemoryFlags(node.common_resource_attributes, self.source); }, .BITMAP => { @@ -855,47 +885,32 @@ pub const Compiler = struct { } else if (bitmap_info.getActualPaletteByteLen() < bitmap_info.getExpectedPaletteByteLen()) { const num_padding_bytes = bitmap_info.getExpectedPaletteByteLen() - bitmap_info.getActualPaletteByteLen(); - // TODO: Make this configurable (command line option) - const max_missing_bytes = 4096; - if (num_padding_bytes > max_missing_bytes) { - var numbers_as_bytes: [16]u8 = undefined; - std.mem.writeInt(u64, numbers_as_bytes[0..8], num_padding_bytes, native_endian); - std.mem.writeInt(u64, numbers_as_bytes[8..16], max_missing_bytes, native_endian); - const values_string_index = try self.diagnostics.putString(&numbers_as_bytes); - try self.addErrorDetails(.{ - .err = .bmp_too_many_missing_palette_bytes, - .token = filename_token, - .extra = .{ .number = values_string_index }, - }); - return self.addErrorDetailsAndFail(.{ - .err = .bmp_too_many_missing_palette_bytes, - .type = .note, - .print_source_line = false, - .token = filename_token, - }); - } - var number_as_bytes: [8]u8 = undefined; std.mem.writeInt(u64, &number_as_bytes, num_padding_bytes, native_endian); const value_string_index = try self.diagnostics.putString(&number_as_bytes); try self.addErrorDetails(.{ .err = .bmp_missing_palette_bytes, - .type = .warning, + .type = .err, .token = filename_token, .extra = .{ .number = value_string_index }, }); const pixel_data_len = bitmap_info.getPixelDataLen(file_size); + // TODO: This is a hack, but we know we have already added + // at least one entry to the diagnostics strings, so we can + // get away with using 0 to mean 'no string' here. 
+ var miscompiled_bytes_string_index: u32 = 0; if (pixel_data_len > 0) { const miscompiled_bytes = @min(pixel_data_len, num_padding_bytes); std.mem.writeInt(u64, &number_as_bytes, miscompiled_bytes, native_endian); - const miscompiled_bytes_string_index = try self.diagnostics.putString(&number_as_bytes); - try self.addErrorDetails(.{ - .err = .rc_would_miscompile_bmp_palette_padding, - .type = .warning, - .token = filename_token, - .extra = .{ .number = miscompiled_bytes_string_index }, - }); + miscompiled_bytes_string_index = try self.diagnostics.putString(&number_as_bytes); } + return self.addErrorDetailsAndFail(.{ + .err = .rc_would_miscompile_bmp_palette_padding, + .type = .note, + .print_source_line = false, + .token = filename_token, + .extra = .{ .number = miscompiled_bytes_string_index }, + }); } // TODO: It might be possible that the calculation done in this function @@ -905,7 +920,7 @@ pub const Compiler = struct { const bmp_bytes_to_write: u32 = @intCast(bitmap_info.getExpectedByteLen(file_size)); header.data_size = bmp_bytes_to_write; - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); try file.seekTo(bmp.file_header_len); const file_reader = file.reader(); try writeResourceDataNoPadding(writer, file_reader, bitmap_info.dib_header_size); @@ -914,12 +929,6 @@ pub const Compiler = struct { } if (bitmap_info.getExpectedPaletteByteLen() > 0) { try writeResourceDataNoPadding(writer, file_reader, @intCast(bitmap_info.getActualPaletteByteLen())); - // We know that the number of missing palette bytes is <= 4096 - // (see `bmp_too_many_missing_palette_bytes` error case above) - const padding_bytes: usize = @intCast(bitmap_info.getMissingPaletteByteLen()); - if (padding_bytes > 0) { - try writer.writeByteNTimes(0, padding_bytes); - } } try file.seekTo(bitmap_info.pixel_data_offset); const pixel_bytes: u32 = @intCast(file_size - bitmap_info.pixel_data_offset); @@ -932,13 +941,13 
@@ pub const Compiler = struct { // Add warning and skip this resource // Note: The Win32 compiler prints this as an error but it doesn't fail the compilation // and the duplicate resource is skipped. - try self.addErrorDetails(ErrorDetails{ + try self.addErrorDetails(.{ .err = .font_id_already_defined, .token = node.id, .type = .warning, .extra = .{ .number = header.name_value.ordinal }, }); - try self.addErrorDetails(ErrorDetails{ + try self.addErrorDetails(.{ .err = .font_id_already_defined, .token = self.state.font_dir.ids.get(header.name_value.ordinal).?, .type = .note, @@ -957,7 +966,7 @@ pub const Compiler = struct { // We now know that the data size will fit in a u32 header.data_size = @intCast(file_size); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); var header_slurping_reader = headerSlurpingReader(148, file.reader()); try writeResourceData(writer, header_slurping_reader.reader(), header.data_size); @@ -968,19 +977,13 @@ pub const Compiler = struct { }, node.id); return; }, - .ACCELERATOR, - .ANICURSOR, - .ANIICON, - .CURSOR, - .DIALOG, - .DLGINCLUDE, - .FONTDIR, - .ICON, - .MENU, - .STRING, - .TOOLBAR, - .VERSION, - .VXD, + .ACCELERATOR, // Cannot use an external file, enforced by the parser + .DIALOG, // Cannot use an external file, enforced by the parser + .DLGINCLUDE, // Handled specially above + .MENU, // Cannot use an external file, enforced by the parser + .STRING, // Parser error if this resource is specified as a number + .TOOLBAR, // Cannot use an external file, enforced by the parser + .VERSION, // Cannot use an external file, enforced by the parser => unreachable, _ => unreachable, } @@ -998,7 +1001,7 @@ pub const Compiler = struct { } // We now know that the data size will fit in a u32 header.data_size = @intCast(data_size); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, 
self.errContext(node.id)); try writeResourceData(writer, file.reader(), header.data_size); } @@ -1188,7 +1191,7 @@ pub const Compiler = struct { }; const parsed = try literals.parseQuotedAsciiString(self.allocator, bytes, .{ .start_column = column, - .diagnostics = .{ .diagnostics = self.diagnostics, .token = literal_node.token }, + .diagnostics = self.errContext(literal_node.token), .output_code_page = self.output_code_pages.getForToken(literal_node.token), }); errdefer self.allocator.free(parsed); @@ -1202,7 +1205,8 @@ pub const Compiler = struct { }; const parsed_string = try literals.parseQuotedWideString(self.allocator, bytes, .{ .start_column = column, - .diagnostics = .{ .diagnostics = self.diagnostics, .token = literal_node.token }, + .diagnostics = self.errContext(literal_node.token), + .output_code_page = self.output_code_pages.getForToken(literal_node.token), }); errdefer self.allocator.free(parsed_string); return .{ .wide_string = parsed_string }; @@ -1259,7 +1263,7 @@ pub const Compiler = struct { header.applyMemoryFlags(common_resource_attributes, self.source); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = id_token }); + try header.write(writer, self.errContext(id_token)); } pub fn writeResourceDataNoPadding(writer: anytype, data_reader: anytype, data_size: u32) !void { @@ -1297,7 +1301,8 @@ pub const Compiler = struct { const column = literal.token.calculateColumn(self.source, 8, null); return res.parseAcceleratorKeyString(bytes, is_virt, .{ .start_column = column, - .diagnostics = .{ .diagnostics = self.diagnostics, .token = literal.token }, + .diagnostics = self.errContext(literal.token), + .output_code_page = self.output_code_pages.getForToken(literal.token), }); } } @@ -1332,7 +1337,7 @@ pub const Compiler = struct { header.applyMemoryFlags(node.common_resource_attributes, self.source); header.applyOptionalStatements(node.optional_statements, self.source, self.input_code_pages); - try header.write(writer, .{ .diagnostics 
= self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); var data_fbs = std.io.fixedBufferStream(data_buffer.items); try writeResourceData(writer, data_fbs.reader(), data_size); @@ -1348,6 +1353,16 @@ pub const Compiler = struct { const modifier = rc.AcceleratorTypeAndOptions.map.get(type_or_option.slice(self.source)).?; modifiers.apply(modifier); } + if ((modifiers.isSet(.control) or modifiers.isSet(.shift)) and !modifiers.isSet(.virtkey)) { + try self.addErrorDetails(.{ + .err = .accelerator_shift_or_control_without_virtkey, + .type = .warning, + // We know that one of SHIFT or CONTROL was specified, so there's at least one item + // in this list. + .token = accelerator.type_and_options[0], + .token_span_end = accelerator.type_and_options[accelerator.type_and_options.len - 1], + }); + } if (accelerator.event.isNumberExpression() and !modifiers.explicit_ascii_or_virtkey) { return self.addErrorDetailsAndFail(.{ .err = .accelerator_type_required, @@ -1399,7 +1414,7 @@ pub const Compiler = struct { var limited_writer = limitedWriter(data_buffer.writer(), std.math.maxInt(u32)); const data_writer = limited_writer.writer(); - const resource = Resource.fromString(.{ + const resource = ResourceType.fromString(.{ .slice = node.type.slice(self.source), .code_page = self.input_code_pages.getForToken(node.type), }); @@ -1414,8 +1429,6 @@ pub const Compiler = struct { menu.deinit(self.allocator); } } - var skipped_menu_or_classes = std.ArrayList(*Node.SimpleStatement).init(self.allocator); - defer skipped_menu_or_classes.deinit(); var last_menu: *Node.SimpleStatement = undefined; var last_class: *Node.SimpleStatement = undefined; var last_menu_would_be_forced_ordinal = false; @@ -1445,9 +1458,6 @@ pub const Compiler = struct { }, .class => { const is_duplicate = optional_statement_values.class != null; - if (is_duplicate) { - try skipped_menu_or_classes.append(last_class); - } const forced_ordinal = is_duplicate and 
optional_statement_values.class.? == .ordinal; // In the Win32 RC compiler, if any CLASS values that are interpreted as // an ordinal exist, it affects all future CLASS statements and forces @@ -1475,9 +1485,6 @@ pub const Compiler = struct { }, .menu => { const is_duplicate = optional_statement_values.menu != null; - if (is_duplicate) { - try skipped_menu_or_classes.append(last_menu); - } const forced_ordinal = is_duplicate and optional_statement_values.menu.? == .ordinal; // In the Win32 RC compiler, if any MENU values that are interpreted as // an ordinal exist, it affects all future MENU statements and forces @@ -1561,22 +1568,6 @@ pub const Compiler = struct { } } - for (skipped_menu_or_classes.items) |simple_statement| { - const statement_identifier = simple_statement.identifier; - const statement_type = rc.OptionalStatements.dialog_map.get(statement_identifier.slice(self.source)) orelse continue; - try self.addErrorDetails(.{ - .err = .duplicate_menu_or_class_skipped, - .type = .warning, - .token = simple_statement.identifier, - .token_span_start = simple_statement.base.getFirstToken(), - .token_span_end = simple_statement.base.getLastToken(), - .extra = .{ .menu_or_class = switch (statement_type) { - .menu => .menu, - .class => .class, - else => unreachable, - } }, - }); - } // The Win32 RC compiler miscompiles the value in the following scenario: // Multiple CLASS parameters are specified and any of them are treated as a number, then // the last CLASS is always treated as a number no matter what @@ -1739,7 +1730,7 @@ pub const Compiler = struct { header.applyMemoryFlags(node.common_resource_attributes, self.source); header.applyOptionalStatements(node.optional_statements, self.source, self.input_code_pages); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); var data_fbs = std.io.fixedBufferStream(data_buffer.items); try writeResourceData(writer, data_fbs.reader(), 
data_size); @@ -1749,7 +1740,7 @@ pub const Compiler = struct { self: *Compiler, node: *Node.Dialog, data_writer: anytype, - resource: Resource, + resource: ResourceType, optional_statement_values: *const DialogOptionalStatementValues, x: Number, y: Number, @@ -1809,7 +1800,7 @@ pub const Compiler = struct { self: *Compiler, control: *Node.ControlStatement, data_writer: anytype, - resource: Resource, + resource: ResourceType, bytes_written_so_far: u32, controls_by_id: *std.AutoHashMap(u32, *const Node.ControlStatement), ) !void { @@ -2053,7 +2044,7 @@ pub const Compiler = struct { header.applyMemoryFlags(node.common_resource_attributes, self.source); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); var data_fbs = std.io.fixedBufferStream(data_buffer.items); try writeResourceData(writer, data_fbs.reader(), data_size); @@ -2067,7 +2058,7 @@ pub const Compiler = struct { node: *Node.FontStatement, }; - pub fn writeDialogFont(self: *Compiler, resource: Resource, values: FontStatementValues, writer: anytype) !void { + pub fn writeDialogFont(self: *Compiler, resource: ResourceType, values: FontStatementValues, writer: anytype) !void { const node = values.node; const point_size = evaluateNumberExpression(node.point_size, self.source, self.input_code_pages); try writer.writeInt(u16, point_size.asWord(), .little); @@ -2104,7 +2095,7 @@ pub const Compiler = struct { .slice = node.type.slice(self.source), .code_page = self.input_code_pages.getForToken(node.type), }; - const resource = Resource.fromString(type_bytes); + const resource = ResourceType.fromString(type_bytes); std.debug.assert(resource == .menu or resource == .menuex); self.writeMenuData(node, data_writer, resource) catch |err| switch (err) { @@ -2128,7 +2119,7 @@ pub const Compiler = struct { header.applyMemoryFlags(node.common_resource_attributes, self.source); header.applyOptionalStatements(node.optional_statements, 
self.source, self.input_code_pages); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); var data_fbs = std.io.fixedBufferStream(data_buffer.items); try writeResourceData(writer, data_fbs.reader(), data_size); @@ -2136,7 +2127,7 @@ pub const Compiler = struct { /// Expects `data_writer` to be a LimitedWriter limited to u32, meaning all writes to /// the writer within this function could return error.NoSpaceLeft - pub fn writeMenuData(self: *Compiler, node: *Node.Menu, data_writer: anytype, resource: Resource) !void { + pub fn writeMenuData(self: *Compiler, node: *Node.Menu, data_writer: anytype, resource: ResourceType) !void { // menu header const version: u16 = if (resource == .menu) 0 else 1; try data_writer.writeInt(u16, version, .little); @@ -2393,7 +2384,7 @@ pub const Compiler = struct { header.applyMemoryFlags(node.common_resource_attributes, self.source); - try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id }); + try header.write(writer, self.errContext(node.id)); var data_fbs = std.io.fixedBufferStream(data_buffer.items); try writeResourceData(writer, data_fbs.reader(), data_size); @@ -2525,14 +2516,14 @@ pub const Compiler = struct { // It might be nice to have these errors point to the ids rather than the // string tokens, but that would mean storing the id token of each string // which doesn't seem worth it just for slightly better error messages. 
- try self.addErrorDetails(ErrorDetails{ + try self.addErrorDetails(.{ .err = .string_already_defined, .token = string.string, .extra = .{ .string_and_language = .{ .id = string_id, .language = language } }, }); const existing_def_table = self.state.string_tables.tables.getPtr(language).?; const existing_definition = existing_def_table.get(string_id).?; - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .string_already_defined, .type = .note, .token = existing_definition, @@ -2628,7 +2619,7 @@ pub const Compiler = struct { pub fn init(allocator: Allocator, id_bytes: SourceBytes, type_bytes: SourceBytes, data_size: DWORD, language: res.Language, version: DWORD, characteristics: DWORD) InitError!ResourceHeader { const type_value = type: { - const resource_type = Resource.fromString(type_bytes); + const resource_type = ResourceType.fromString(type_bytes); if (res.RT.fromResource(resource_type)) |rt_constant| { break :type NameOrOrdinal{ .ordinal = @intFromEnum(rt_constant) }; } else { @@ -2673,7 +2664,7 @@ pub const Compiler = struct { padding_after_name: u2, }; - fn calcSize(self: ResourceHeader) error{Overflow}!SizeInfo { + pub fn calcSize(self: ResourceHeader) error{Overflow}!SizeInfo { var header_size: u32 = 8; header_size = try std.math.add( u32, @@ -2699,6 +2690,7 @@ pub const Compiler = struct { const size_info = self.calcSize() catch { try err_ctx.diagnostics.append(.{ .err = .resource_data_size_exceeds_max, + .code_page = err_ctx.code_page, .token = err_ctx.token, }); return error.CompileError; @@ -2706,7 +2698,7 @@ pub const Compiler = struct { return self.writeSizeInfo(writer, size_info); } - fn writeSizeInfo(self: ResourceHeader, writer: anytype, size_info: SizeInfo) !void { + pub fn writeSizeInfo(self: ResourceHeader, writer: anytype, size_info: SizeInfo) !void { try writer.writeInt(DWORD, self.data_size, .little); // DataSize try writer.writeInt(DWORD, size_info.bytes, .little); // HeaderSize try 
self.type_value.write(writer); // TYPE @@ -2863,19 +2855,44 @@ pub const Compiler = struct { self.sourceBytesForToken(token), .{ .start_column = token.calculateColumn(self.source, 8, null), - .diagnostics = .{ .diagnostics = self.diagnostics, .token = token }, + .diagnostics = self.errContext(token), + .output_code_page = self.output_code_pages.getForToken(token), }, ); } - fn addErrorDetails(self: *Compiler, details: ErrorDetails) Allocator.Error!void { + fn addErrorDetailsWithCodePage(self: *Compiler, details: ErrorDetails) Allocator.Error!void { try self.diagnostics.append(details); } - fn addErrorDetailsAndFail(self: *Compiler, details: ErrorDetails) error{ CompileError, OutOfMemory } { - try self.addErrorDetails(details); + /// Code page is looked up in input_code_pages using the token + fn addErrorDetails(self: *Compiler, details_without_code_page: errors.ErrorDetailsWithoutCodePage) Allocator.Error!void { + const details = ErrorDetails{ + .err = details_without_code_page.err, + .code_page = self.input_code_pages.getForToken(details_without_code_page.token), + .token = details_without_code_page.token, + .token_span_start = details_without_code_page.token_span_start, + .token_span_end = details_without_code_page.token_span_end, + .type = details_without_code_page.type, + .print_source_line = details_without_code_page.print_source_line, + .extra = details_without_code_page.extra, + }; + try self.addErrorDetailsWithCodePage(details); + } + + /// Code page is looked up in input_code_pages using the token + fn addErrorDetailsAndFail(self: *Compiler, details_without_code_page: errors.ErrorDetailsWithoutCodePage) error{ CompileError, OutOfMemory } { + try self.addErrorDetails(details_without_code_page); return error.CompileError; } + + fn errContext(self: *Compiler, token: Token) errors.DiagnosticsContext { + return .{ + .diagnostics = self.diagnostics, + .token = token, + .code_page = self.input_code_pages.getForToken(token), + }; + } }; pub const 
OpenSearchPathError = std.fs.Dir.OpenError; @@ -3247,7 +3264,8 @@ pub const StringTable = struct { const bytes = SourceBytes{ .slice = slice, .code_page = code_page }; const utf16_string = try literals.parseQuotedStringAsWideString(compiler.allocator, bytes, .{ .start_column = column, - .diagnostics = .{ .diagnostics = compiler.diagnostics, .token = string_token }, + .diagnostics = compiler.errContext(string_token), + .output_code_page = compiler.output_code_pages.getForToken(string_token), }); defer compiler.allocator.free(utf16_string); diff --git a/lib/compiler/resinator/disjoint_code_page.zig b/lib/compiler/resinator/disjoint_code_page.zig new file mode 100644 index 000000000000..8aacbd490a81 --- /dev/null +++ b/lib/compiler/resinator/disjoint_code_page.zig @@ -0,0 +1,99 @@ +const std = @import("std"); +const lex = @import("lex.zig"); +const SourceMappings = @import("source_mapping.zig").SourceMappings; +const SupportedCodePage = @import("code_pages.zig").SupportedCodePage; + +pub fn hasDisjointCodePage(source: []const u8, source_mappings: ?*const SourceMappings, default_code_page: SupportedCodePage) bool { + var line_handler = lex.LineHandler{ .buffer = source }; + var i: usize = 0; + while (i < source.len) { + const codepoint = default_code_page.codepointAt(i, source) orelse break; + const c = codepoint.value; + switch (c) { + '\r', '\n' => { + _ = line_handler.incrementLineNumber(i); + // Any lines that are not from the root file interrupt the disjoint code page + if (source_mappings != null and !source_mappings.?.isRootFile(line_handler.line_number)) return false; + }, + // whitespace is ignored + ' ', + '\t', + // NBSP, this should technically be in the TODO below, but it is treated as whitespace + // due to a (misguided) special casing in the lexer, see the TODO in lex.zig + '\u{A0}', + => {}, + + // TODO: All of the below are treated as whitespace by the Win32 RC preprocessor, which also + // means they are trimmed from the file during preprocessing. 
This means that these characters + // should be treated like ' ', '\t' above, but since the resinator preprocessor does not treat + // them as whitespace *or* trim whitespace, files with these characters are likely going to + // error. So, in the future some sort of emulation of/rejection of the Win32 behavior might + // make handling these codepoints specially make sense, but for now it doesn't really matter + // so they are not handled specially for simplicity's sake. + //'\u{1680}', + //'\u{180E}', + //'\u{2001}', + //'\u{2002}', + //'\u{2003}', + //'\u{2004}', + //'\u{2005}', + //'\u{2006}', + //'\u{2007}', + //'\u{2008}', + //'\u{2009}', + //'\u{200A}', + //'\u{2028}', + //'\u{2029}', + //'\u{202F}', + //'\u{205F}', + //'\u{3000}', + + '#' => { + if (source_mappings != null and !source_mappings.?.isRootFile(line_handler.line_number)) { + return false; + } + const start_i = i; + while (i < source.len and source[i] != '\r' and source[i] != '\n') : (i += 1) {} + const line = source[start_i..i]; + _ = (lex.parsePragmaCodePage(line) catch |err| switch (err) { + error.NotPragma => return false, + error.NotCodePagePragma => continue, + error.CodePagePragmaUnsupportedCodePage => continue, + else => continue, + }) orelse return false; // DEFAULT interrupts disjoint code page + + // If we got a code page, then it is a disjoint code page pragma + return true; + }, + else => { + // Any other character interrupts the disjoint code page + return false; + }, + } + + i += codepoint.byte_len; + } + return false; +} + +test hasDisjointCodePage { + try std.testing.expect(hasDisjointCodePage("#pragma code_page(65001)\n", null, .windows1252)); + // NBSP is a special case + try std.testing.expect(hasDisjointCodePage("\xA0\n#pragma code_page(65001)\n", null, .windows1252)); + try std.testing.expect(hasDisjointCodePage("\u{A0}\n#pragma code_page(1252)\n", null, .utf8)); + // other preprocessor commands don't interrupt + try std.testing.expect(hasDisjointCodePage("#pragma foo\n#pragma 
code_page(65001)\n", null, .windows1252)); + // invalid code page doesn't interrupt + try std.testing.expect(hasDisjointCodePage("#pragma code_page(1234567)\n#pragma code_page(65001)\n", null, .windows1252)); + + try std.testing.expect(!hasDisjointCodePage("#if 1\n#endif\n#pragma code_page(65001)", null, .windows1252)); + try std.testing.expect(!hasDisjointCodePage("// comment\n#pragma code_page(65001)", null, .windows1252)); + try std.testing.expect(!hasDisjointCodePage("/* comment */\n#pragma code_page(65001)", null, .windows1252)); +} + +test "multiline comment edge case" { + // TODO + if (true) return error.SkipZigTest; + + try std.testing.expect(hasDisjointCodePage("/* comment */#pragma code_page(65001)", null, .windows1252)); +} diff --git a/lib/compiler/resinator/errors.zig b/lib/compiler/resinator/errors.zig index 67a5a09d3ba0..9727872367d8 100644 --- a/lib/compiler/resinator/errors.zig +++ b/lib/compiler/resinator/errors.zig @@ -8,7 +8,8 @@ const ico = @import("ico.zig"); const bmp = @import("bmp.zig"); const parse = @import("parse.zig"); const lang = @import("lang.zig"); -const CodePage = @import("code_pages.zig").CodePage; +const code_pages = @import("code_pages.zig"); +const SupportedCodePage = code_pages.SupportedCodePage; const builtin = @import("builtin"); const native_endian = builtin.cpu.arch.endian(); @@ -64,7 +65,7 @@ pub const Diagnostics = struct { defer std.debug.unlockStdErr(); const stderr = std.io.getStdErr().writer(); for (self.errors.items) |err_details| { - renderErrorMessage(self.allocator, stderr, tty_config, cwd, err_details, source, self.strings.items, source_mappings) catch return; + renderErrorMessage(stderr, tty_config, cwd, err_details, source, self.strings.items, source_mappings) catch return; } } @@ -94,32 +95,22 @@ pub const Diagnostics = struct { pub const DiagnosticsContext = struct { diagnostics: *Diagnostics, token: Token, + /// Code page of the source file at the token location + code_page: SupportedCodePage, }; pub const 
ErrorDetails = struct { err: Error, token: Token, + /// Code page of the source file at the token location + code_page: SupportedCodePage, /// If non-null, should be before `token`. If null, `token` is assumed to be the start. token_span_start: ?Token = null, /// If non-null, should be after `token`. If null, `token` is assumed to be the end. token_span_end: ?Token = null, type: Type = .err, print_source_line: bool = true, - extra: union { - none: void, - expected: Token.Id, - number: u32, - expected_types: ExpectedTypes, - resource: rc.Resource, - string_and_language: StringAndLanguage, - file_open_error: FileOpenError, - icon_read_error: IconReadError, - icon_dir: IconDirContext, - bmp_read_error: BitmapReadError, - accelerator_error: AcceleratorError, - statement_with_u16_param: StatementWithU16Param, - menu_or_class: enum { class, menu }, - } = .{ .none = {} }, + extra: Extra = .{ .none = {} }, pub const Type = enum { /// Fatal error, stops compilation @@ -137,9 +128,25 @@ pub const ErrorDetails = struct { hint, }; + pub const Extra = union { + none: void, + expected: Token.Id, + number: u32, + expected_types: ExpectedTypes, + resource: rc.ResourceType, + string_and_language: StringAndLanguage, + file_open_error: FileOpenError, + icon_read_error: IconReadError, + icon_dir: IconDirContext, + bmp_read_error: BitmapReadError, + accelerator_error: AcceleratorError, + statement_with_u16_param: StatementWithU16Param, + menu_or_class: enum { class, menu }, + }; + comptime { // all fields in the extra union should be 32 bits or less - for (std.meta.fields(std.meta.fieldInfo(ErrorDetails, .extra).type)) |field| { + for (std.meta.fields(Extra)) |field| { std.debug.assert(@bitSizeOf(field.type) <= 32); } } @@ -321,6 +328,8 @@ pub const ErrorDetails = struct { close_paren_expression, unary_plus_expression, rc_could_miscompile_control_params, + dangling_literal_at_eof, + disjoint_code_page, // Compiler /// `string_and_language` is populated @@ -331,6 +340,7 @@ pub const 
ErrorDetails = struct { /// `accelerator_error` is populated invalid_accelerator_key, accelerator_type_required, + accelerator_shift_or_control_without_virtkey, rc_would_miscompile_control_padding, rc_would_miscompile_control_class_ordinal, /// `icon_dir` is populated @@ -356,11 +366,6 @@ pub const ErrorDetails = struct { /// `number` is populated and contains a string index for which the string contains /// the bytes of a `u64` (native endian). The `u64` contains the number of miscompiled bytes. rc_would_miscompile_bmp_palette_padding, - /// `number` is populated and contains a string index for which the string contains - /// the bytes of two `u64`s (native endian). The first contains the number of missing - /// palette bytes and the second contains the max number of missing palette bytes. - /// If type is `.note`, then `extra` is `none`. - bmp_too_many_missing_palette_bytes, resource_header_size_exceeds_max, resource_data_size_exceeds_max, control_extra_data_size_exceeds_max, @@ -383,15 +388,16 @@ pub const ErrorDetails = struct { rc_would_miscompile_dialog_menu_or_class_id_forced_ordinal, rc_would_miscompile_dialog_menu_id_starts_with_digit, dialog_menu_id_was_uppercased, - /// `menu_or_class` is populated and contains the type of the parameter statement - duplicate_menu_or_class_skipped, + duplicate_optional_statement_skipped, invalid_digit_character_in_ordinal, // Literals /// `number` is populated - rc_would_miscompile_codepoint_byte_swap, + rc_would_miscompile_codepoint_whitespace, /// `number` is populated rc_would_miscompile_codepoint_skip, + /// `number` is populated + rc_would_miscompile_codepoint_bom, tab_converted_to_spaces, // General (used in various places) @@ -403,10 +409,50 @@ pub const ErrorDetails = struct { failed_to_open_cwd, }; + fn formatToken( + ctx: TokenFormatContext, + comptime fmt: []const u8, + options: std.fmt.FormatOptions, + writer: anytype, + ) !void { + _ = fmt; + _ = options; + + switch (ctx.token.id) { + .eof => return 
writer.writeAll(ctx.token.id.nameForErrorDisplay()), + else => {}, + } + + const slice = ctx.token.slice(ctx.source); + var src_i: usize = 0; + while (src_i < slice.len) { + const codepoint = ctx.code_page.codepointAt(src_i, slice) orelse break; + defer src_i += codepoint.byte_len; + const display_codepoint = codepointForDisplay(codepoint) orelse continue; + var buf: [4]u8 = undefined; + const utf8_len = std.unicode.utf8Encode(display_codepoint, &buf) catch unreachable; + try writer.writeAll(buf[0..utf8_len]); + } + } + + const TokenFormatContext = struct { + token: Token, + source: []const u8, + code_page: SupportedCodePage, + }; + + fn fmtToken(self: ErrorDetails, source: []const u8) std.fmt.Formatter(formatToken) { + return .{ .data = .{ + .token = self.token, + .code_page = self.code_page, + .source = source, + } }; + } + pub fn render(self: ErrorDetails, writer: anytype, source: []const u8, strings: []const []const u8) !void { switch (self.err) { .unfinished_string_literal => { - return writer.print("unfinished string literal at '{s}', expected closing '\"'", .{self.token.nameForErrorDisplay(source)}); + return writer.print("unfinished string literal at '{s}', expected closing '\"'", .{self.fmtToken(source)}); }, .string_literal_too_long => { return writer.print("string literal too long (max is currently {} characters)", .{self.extra.number}); @@ -474,33 +520,33 @@ pub const ErrorDetails = struct { number_slice.len += 1; } const number = std.fmt.parseUnsigned(u16, number_slice, 10) catch unreachable; - const code_page = CodePage.getByIdentifier(number) catch unreachable; + const code_page = code_pages.getByIdentifier(number) catch unreachable; // TODO: Improve or maybe add a note making it more clear that the code page // is valid and that the code page is unsupported purely due to a limitation // in this compiler. 
return writer.print("unsupported code page '{s} (id={})' in #pragma code_page", .{ @tagName(code_page), number }); }, .unfinished_raw_data_block => { - return writer.print("unfinished raw data block at '{s}', expected closing '}}' or 'END'", .{self.token.nameForErrorDisplay(source)}); + return writer.print("unfinished raw data block at '{s}', expected closing '}}' or 'END'", .{self.fmtToken(source)}); }, .unfinished_string_table_block => { - return writer.print("unfinished STRINGTABLE block at '{s}', expected closing '}}' or 'END'", .{self.token.nameForErrorDisplay(source)}); + return writer.print("unfinished STRINGTABLE block at '{s}', expected closing '}}' or 'END'", .{self.fmtToken(source)}); }, .expected_token => { - return writer.print("expected '{s}', got '{s}'", .{ self.extra.expected.nameForErrorDisplay(), self.token.nameForErrorDisplay(source) }); + return writer.print("expected '{s}', got '{s}'", .{ self.extra.expected.nameForErrorDisplay(), self.fmtToken(source) }); }, .expected_something_else => { try writer.writeAll("expected "); try self.extra.expected_types.writeCommaSeparated(writer); - return writer.print("; got '{s}'", .{self.token.nameForErrorDisplay(source)}); + return writer.print("; got '{s}'", .{self.fmtToken(source)}); }, .resource_type_cant_use_raw_data => switch (self.type) { - .err, .warning => try writer.print("expected '', found '{s}' (resource type '{s}' can't use raw data)", .{ self.token.nameForErrorDisplay(source), self.extra.resource.nameForErrorDisplay() }), - .note => try writer.print("if '{s}' is intended to be a filename, it must be specified as a quoted string literal", .{self.token.nameForErrorDisplay(source)}), + .err, .warning => try writer.print("expected '', found '{s}' (resource type '{s}' can't use raw data)", .{ self.fmtToken(source), self.extra.resource.nameForErrorDisplay() }), + .note => try writer.print("if '{s}' is intended to be a filename, it must be specified as a quoted string literal", 
.{self.fmtToken(source)}), .hint => return, }, .id_must_be_ordinal => { - try writer.print("id of resource type '{s}' must be an ordinal (u16), got '{s}'", .{ self.extra.resource.nameForErrorDisplay(), self.token.nameForErrorDisplay(source) }); + try writer.print("id of resource type '{s}' must be an ordinal (u16), got '{s}'", .{ self.extra.resource.nameForErrorDisplay(), self.fmtToken(source) }); }, .name_or_id_not_allowed => { try writer.print("name or id is not allowed for resource type '{s}'", .{self.extra.resource.nameForErrorDisplay()}); @@ -516,7 +562,7 @@ pub const ErrorDetails = struct { try writer.writeAll("ASCII character not equivalent to virtual key code"); }, .empty_menu_not_allowed => { - try writer.print("empty menu of type '{s}' not allowed", .{self.token.nameForErrorDisplay(source)}); + try writer.print("empty menu of type '{s}' not allowed", .{self.fmtToken(source)}); }, .rc_would_miscompile_version_value_padding => switch (self.type) { .err, .warning => return writer.print("the padding before this quoted string value would be miscompiled by the Win32 RC compiler", .{}), @@ -570,19 +616,18 @@ pub const ErrorDetails = struct { .note => return writer.print("to avoid the potential miscompilation, consider adding a comma after the style parameter", .{}), .hint => return, }, + .dangling_literal_at_eof => { + try writer.writeAll("dangling literal at end-of-file; this is not a problem, but it is likely a mistake"); + }, + .disjoint_code_page => switch (self.type) { + .err, .warning => return writer.print("#pragma code_page as the first thing in the .rc script can cause the input and output code pages to become out-of-sync", .{}), + .note => return writer.print("to avoid unexpected behavior, add a comment (or anything else) above the #pragma code_page line", .{}), + .hint => return, + }, .string_already_defined => switch (self.type) { .err, .warning => { - const language_id = self.extra.string_and_language.language.asInt(); - const language_name = 
language_name: { - if (std.meta.intToEnum(lang.LanguageId, language_id)) |lang_enum_val| { - break :language_name @tagName(lang_enum_val); - } else |_| {} - if (language_id == lang.LOCALE_CUSTOM_UNSPECIFIED) { - break :language_name "LOCALE_CUSTOM_UNSPECIFIED"; - } - break :language_name ""; - }; - return writer.print("string with id {d} (0x{X}) already defined for language {s} (0x{X})", .{ self.extra.string_and_language.id, self.extra.string_and_language.id, language_name, language_id }); + const language = self.extra.string_and_language.language; + return writer.print("string with id {d} (0x{X}) already defined for language {}", .{ self.extra.string_and_language.id, self.extra.string_and_language.id, language }); }, .note => return writer.print("previous definition of string with id {d} (0x{X}) here", .{ self.extra.string_and_language.id, self.extra.string_and_language.id }), .hint => return, @@ -597,14 +642,17 @@ pub const ErrorDetails = struct { try writer.print("unable to open file '{s}': {s}", .{ strings[self.extra.file_open_error.filename_string_index], @tagName(self.extra.file_open_error.err) }); }, .invalid_accelerator_key => { - try writer.print("invalid accelerator key '{s}': {s}", .{ self.token.nameForErrorDisplay(source), @tagName(self.extra.accelerator_error.err) }); + try writer.print("invalid accelerator key '{s}': {s}", .{ self.fmtToken(source), @tagName(self.extra.accelerator_error.err) }); }, .accelerator_type_required => { - try writer.print("accelerator type [ASCII or VIRTKEY] required when key is an integer", .{}); + try writer.writeAll("accelerator type [ASCII or VIRTKEY] required when key is an integer"); + }, + .accelerator_shift_or_control_without_virtkey => { + try writer.writeAll("SHIFT or CONTROL used without VIRTKEY"); }, .rc_would_miscompile_control_padding => switch (self.type) { .err, .warning => return writer.print("the padding before this control would be miscompiled by the Win32 RC compiler (it would insert 2 extra bytes of 
padding)", .{}), - .note => return writer.print("to avoid the potential miscompilation, consider removing any 'control data' blocks from the controls in this dialog", .{}), + .note => return writer.print("to avoid the potential miscompilation, consider adding one more byte to the control data of the control preceding this one", .{}), .hint => return, }, .rc_would_miscompile_control_class_ordinal => switch (self.type) { @@ -625,7 +673,7 @@ pub const ErrorDetails = struct { try writer.print("resource with format '{s}' (at index {}) is not allowed in {s} resource groups", .{ @tagName(self.extra.icon_dir.icon_format), self.extra.icon_dir.index, @tagName(self.extra.icon_dir.icon_type) }); }, .icon_dir_and_resource_type_mismatch => { - const unexpected_type: rc.Resource = if (self.extra.resource == .icon) .cursor else .icon; + const unexpected_type: rc.ResourceType = if (self.extra.resource == .icon) .cursor else .icon; // TODO: Better wording try writer.print("resource type '{s}' does not match type '{s}' specified in the file", .{ self.extra.resource.nameForErrorDisplay(), unexpected_type.nameForErrorDisplay() }); }, @@ -663,23 +711,15 @@ pub const ErrorDetails = struct { .bmp_missing_palette_bytes => { const bytes = strings[self.extra.number]; const missing_bytes = std.mem.readInt(u64, bytes[0..8], native_endian); - try writer.print("bitmap has {d} missing color palette bytes which will be padded with zeroes", .{missing_bytes}); + try writer.print("bitmap has {d} missing color palette bytes", .{missing_bytes}); }, .rc_would_miscompile_bmp_palette_padding => { - const bytes = strings[self.extra.number]; - const miscompiled_bytes = std.mem.readInt(u64, bytes[0..8], native_endian); - try writer.print("the missing color palette bytes would be miscompiled by the Win32 RC compiler (the added padding bytes would include {d} bytes of the pixel data)", .{miscompiled_bytes}); - }, - .bmp_too_many_missing_palette_bytes => switch (self.type) { - .err, .warning => { + try 
writer.writeAll("the Win32 RC compiler would erroneously pad out the missing bytes"); + if (self.extra.number != 0) { const bytes = strings[self.extra.number]; - const missing_bytes = std.mem.readInt(u64, bytes[0..8], native_endian); - const max_missing_bytes = std.mem.readInt(u64, bytes[8..16], native_endian); - try writer.print("bitmap has {} missing color palette bytes which exceeds the maximum of {}", .{ missing_bytes, max_missing_bytes }); - }, - // TODO: command line option - .note => try writer.writeAll("the maximum number of missing color palette bytes is configurable via <>"), - .hint => return, + const miscompiled_bytes = std.mem.readInt(u64, bytes[0..8], native_endian); + try writer.print(" (and the added padding bytes would include {d} bytes of the pixel data)", .{miscompiled_bytes}); + } }, .resource_header_size_exceeds_max => { try writer.print("resource's header length exceeds maximum of {} bytes", .{std.math.maxInt(u32)}); @@ -749,23 +789,22 @@ pub const ErrorDetails = struct { .hint => return, }, .dialog_menu_id_was_uppercased => return, - .duplicate_menu_or_class_skipped => { - return writer.print("this {s} was ignored; when multiple {s} statements are specified, only the last takes precedence", .{ - @tagName(self.extra.menu_or_class), - @tagName(self.extra.menu_or_class), - }); + .duplicate_optional_statement_skipped => { + return writer.writeAll("this statement was ignored; when multiple statements of the same type are specified, only the last takes precedence"); }, .invalid_digit_character_in_ordinal => { return writer.writeAll("non-ASCII digit characters are not allowed in ordinal (number) values"); }, - .rc_would_miscompile_codepoint_byte_swap => switch (self.type) { - .err, .warning => return writer.print("codepoint U+{X} within a string literal would be miscompiled by the Win32 RC compiler (the bytes of the UTF-16 code unit would be swapped)", .{self.extra.number}), - .note => return writer.print("to avoid the potential miscompilation, an 
integer escape sequence in a wide string literal could be used instead: L\"\\x{X}\"", .{self.extra.number}), - .hint => return, + .rc_would_miscompile_codepoint_whitespace => { + const treated_as = self.extra.number >> 8; + return writer.print("codepoint U+{X:0>4} within a string literal would be miscompiled by the Win32 RC compiler (it would get treated as U+{X:0>4})", .{ self.extra.number, treated_as }); }, - .rc_would_miscompile_codepoint_skip => switch (self.type) { - .err, .warning => return writer.print("codepoint U+{X} within a string literal would be miscompiled by the Win32 RC compiler (the codepoint would be missing from the compiled resource)", .{self.extra.number}), - .note => return writer.print("to avoid the potential miscompilation, an integer escape sequence in a wide string literal could be used instead: L\"\\x{X}\"", .{self.extra.number}), + .rc_would_miscompile_codepoint_skip => { + return writer.print("codepoint U+{X:0>4} within a string literal would be miscompiled by the Win32 RC compiler (the codepoint would be missing from the compiled resource)", .{self.extra.number}); + }, + .rc_would_miscompile_codepoint_bom => switch (self.type) { + .err, .warning => return writer.print("codepoint U+{X:0>4} within a string literal would cause the entire file to be miscompiled by the Win32 RC compiler", .{self.extra.number}), + .note => return writer.writeAll("the presence of this codepoint causes all non-ASCII codepoints to be byteswapped by the Win32 RC preprocessor"), .hint => return, }, .tab_converted_to_spaces => switch (self.type) { @@ -790,14 +829,7 @@ pub const ErrorDetails = struct { after_len: usize, }; - pub fn visualTokenInfo(self: ErrorDetails, source_line_start: usize, source_line_end: usize) VisualTokenInfo { - // Note: A perfect solution here would involve full grapheme cluster - // awareness, but oh well. 
This will give incorrect offsets - // if there are any multibyte codepoints within the relevant span, - // and even more inflated for grapheme clusters. - // - // We mitigate this slightly when we know we'll be pointing at - // something that displays as 1 character. + pub fn visualTokenInfo(self: ErrorDetails, source_line_start: usize, source_line_end: usize, source: []const u8) VisualTokenInfo { return switch (self.err) { // These can technically be more than 1 byte depending on encoding, // but they always refer to one visual character/grapheme. @@ -808,27 +840,65 @@ pub const ErrorDetails = struct { .illegal_private_use_character, => .{ .before_len = 0, - .point_offset = self.token.start - source_line_start, + .point_offset = cellCount(self.code_page, source, source_line_start, self.token.start), .after_len = 0, }, else => .{ .before_len = before: { const start = @max(source_line_start, if (self.token_span_start) |span_start| span_start.start else self.token.start); - break :before self.token.start - start; + break :before cellCount(self.code_page, source, start, self.token.start); }, - .point_offset = self.token.start - source_line_start, + .point_offset = cellCount(self.code_page, source, source_line_start, self.token.start), .after_len = after: { const end = @min(source_line_end, if (self.token_span_end) |span_end| span_end.end else self.token.end); // end may be less than start when pointing to EOF if (end <= self.token.start) break :after 0; - break :after end - self.token.start - 1; + break :after cellCount(self.code_page, source, self.token.start, end) - 1; }, }, }; } }; -pub fn renderErrorMessage(allocator: std.mem.Allocator, writer: anytype, tty_config: std.io.tty.Config, cwd: std.fs.Dir, err_details: ErrorDetails, source: []const u8, strings: []const []const u8, source_mappings: ?SourceMappings) !void { +/// Convenience struct only useful when the code page can be inferred from the token +pub const ErrorDetailsWithoutCodePage = blk: { + const 
details_info = @typeInfo(ErrorDetails); + const fields = details_info.@"struct".fields; + var fields_without_codepage: [fields.len - 1]std.builtin.Type.StructField = undefined; + var i: usize = 0; + for (fields) |field| { + if (std.mem.eql(u8, field.name, "code_page")) continue; + fields_without_codepage[i] = field; + i += 1; + } + std.debug.assert(i == fields_without_codepage.len); + break :blk @Type(.{ .@"struct" = .{ + .layout = .auto, + .fields = &fields_without_codepage, + .decls = &.{}, + .is_tuple = false, + } }); +}; + +fn cellCount(code_page: SupportedCodePage, source: []const u8, start_index: usize, end_index: usize) usize { + // Note: This is an imperfect solution. A proper implementation here would + // involve full grapheme cluster awareness + grapheme width data, but oh well. + var codepoint_count: usize = 0; + var index: usize = start_index; + while (index < end_index) { + const codepoint = code_page.codepointAt(index, source) orelse break; + defer index += codepoint.byte_len; + _ = codepointForDisplay(codepoint) orelse continue; + codepoint_count += 1; + // no need to count more than we will display + if (codepoint_count >= max_source_line_codepoints + truncated_str.len) break; + } + return codepoint_count; +} + +const truncated_str = "<...truncated...>"; + +pub fn renderErrorMessage(writer: anytype, tty_config: std.io.tty.Config, cwd: std.fs.Dir, err_details: ErrorDetails, source: []const u8, strings: []const []const u8, source_mappings: ?SourceMappings) !void { if (err_details.type == .hint) return; const source_line_start = err_details.token.getLineStartForErrorDisplay(source); @@ -884,45 +954,61 @@ pub fn renderErrorMessage(allocator: std.mem.Allocator, writer: anytype, tty_con } const source_line = err_details.token.getLineForErrorDisplay(source, source_line_start); - const visual_info = err_details.visualTokenInfo(source_line_start, source_line_start + source_line.len); + const visual_info = err_details.visualTokenInfo(source_line_start, 
source_line_start + source_line.len, source); + const truncated_visual_info = ErrorDetails.VisualTokenInfo{ + .before_len = if (visual_info.point_offset > max_source_line_codepoints and visual_info.before_len > 0) + (visual_info.before_len + 1) -| (visual_info.point_offset - max_source_line_codepoints) + else + visual_info.before_len, + .point_offset = @min(max_source_line_codepoints + 1, visual_info.point_offset), + .after_len = if (visual_info.point_offset > max_source_line_codepoints) + @min(truncated_str.len - 3, visual_info.after_len) + else + @min(max_source_line_codepoints - visual_info.point_offset + (truncated_str.len - 2), visual_info.after_len), + }; // Need this to determine if the 'line originated from' note is worth printing - var source_line_for_display_buf = try std.ArrayList(u8).initCapacity(allocator, source_line.len); - defer source_line_for_display_buf.deinit(); - try writeSourceSlice(source_line_for_display_buf.writer(), source_line); - - // TODO: General handling of long lines, not tied to this specific error - if (err_details.err == .string_literal_too_long) { - const before_slice = source_line[0..@min(source_line.len, visual_info.point_offset + 16)]; - try writeSourceSlice(writer, before_slice); + var source_line_for_display_buf: [max_source_line_bytes]u8 = undefined; + const source_line_for_display = writeSourceSlice(&source_line_for_display_buf, source_line, err_details.code_page); + + try writer.writeAll(source_line_for_display.line); + if (source_line_for_display.truncated) { try tty_config.setColor(writer, .dim); - try writer.writeAll("<...truncated...>"); + try writer.writeAll(truncated_str); try tty_config.setColor(writer, .reset); - } else { - try writer.writeAll(source_line_for_display_buf.items); } try writer.writeByte('\n'); try tty_config.setColor(writer, .green); - const num_spaces = visual_info.point_offset - visual_info.before_len; + const num_spaces = truncated_visual_info.point_offset - truncated_visual_info.before_len; try 
writer.writeByteNTimes(' ', num_spaces); - try writer.writeByteNTimes('~', visual_info.before_len); + try writer.writeByteNTimes('~', truncated_visual_info.before_len); try writer.writeByte('^'); - if (visual_info.after_len > 0) { - var num_squiggles = visual_info.after_len; - if (err_details.err == .string_literal_too_long) { - num_squiggles = @min(num_squiggles, 15); - } - try writer.writeByteNTimes('~', num_squiggles); - } + try writer.writeByteNTimes('~', truncated_visual_info.after_len); try writer.writeByte('\n'); try tty_config.setColor(writer, .reset); if (corresponding_span != null and corresponding_file != null) { - var corresponding_lines = try CorrespondingLines.init(allocator, cwd, err_details, source_line_for_display_buf.items, corresponding_span.?, corresponding_file.?); - defer corresponding_lines.deinit(allocator); - - if (!corresponding_lines.worth_printing_note) return; + var worth_printing_lines: bool = true; + var initial_lines_err: ?anyerror = null; + var corresponding_lines: ?CorrespondingLines = CorrespondingLines.init( + cwd, + err_details, + source_line_for_display.line, + corresponding_span.?, + corresponding_file.?, + ) catch |err| switch (err) { + error.NotWorthPrintingLines => blk: { + worth_printing_lines = false; + break :blk null; + }, + error.NotWorthPrintingNote => return, + else => |e| blk: { + initial_lines_err = e; + break :blk null; + }, + }; + defer if (corresponding_lines) |*cl| cl.deinit(); try tty_config.setColor(writer, .bold); if (corresponding_file) |file| { @@ -947,85 +1033,222 @@ pub fn renderErrorMessage(allocator: std.mem.Allocator, writer: anytype, tty_con try writer.print(" of file '{s}'\n", .{corresponding_file.?}); try tty_config.setColor(writer, .reset); - if (!corresponding_lines.worth_printing_lines) return; - - if (corresponding_lines.lines_is_error_message) { + if (!worth_printing_lines) return; + + const write_lines_err: ?anyerror = write_lines: { + if (initial_lines_err) |err| break :write_lines err; + 
while (corresponding_lines.?.next() catch |err| { + break :write_lines err; + }) |display_line| { + try writer.writeAll(display_line.line); + if (display_line.truncated) { + try tty_config.setColor(writer, .dim); + try writer.writeAll(truncated_str); + try tty_config.setColor(writer, .reset); + } + try writer.writeByte('\n'); + } + break :write_lines null; + }; + if (write_lines_err) |err| { try tty_config.setColor(writer, .red); try writer.writeAll(" | "); try tty_config.setColor(writer, .reset); try tty_config.setColor(writer, .dim); - try writer.writeAll(corresponding_lines.lines.items); + try writer.print("unable to print line(s) from file: {s}\n", .{@errorName(err)}); try tty_config.setColor(writer, .reset); - try writer.writeAll("\n\n"); - return; } - - try writer.writeAll(corresponding_lines.lines.items); - try writer.writeAll("\n\n"); + try writer.writeByte('\n'); } } -const CorrespondingLines = struct { - worth_printing_note: bool = true, - worth_printing_lines: bool = true, - lines: std.ArrayListUnmanaged(u8) = .empty, - lines_is_error_message: bool = false, - - pub fn init(allocator: std.mem.Allocator, cwd: std.fs.Dir, err_details: ErrorDetails, lines_for_comparison: []const u8, corresponding_span: SourceMappings.CorrespondingSpan, corresponding_file: []const u8) !CorrespondingLines { - var corresponding_lines = CorrespondingLines{}; +const VisualLine = struct { + line: []u8, + truncated: bool, +}; +const CorrespondingLines = struct { + // enough room for one more codepoint, just so that we don't have to keep + // track of this being truncated, since the extra codepoint will ensure + // the visual line will need to truncate in that case. 
+ line_buf: [max_source_line_bytes + 4]u8 = undefined, + line_len: usize = 0, + visual_line_buf: [max_source_line_bytes]u8 = undefined, + visual_line_len: usize = 0, + truncated: bool = false, + line_num: usize = 1, + initial_line: bool = true, + last_byte: u8 = 0, + at_eof: bool = false, + span: SourceMappings.CorrespondingSpan, + file: std.fs.File, + buffered_reader: BufferedReaderType, + code_page: SupportedCodePage, + + const BufferedReaderType = std.io.BufferedReader(512, std.fs.File.Reader); + + pub fn init(cwd: std.fs.Dir, err_details: ErrorDetails, line_for_comparison: []const u8, corresponding_span: SourceMappings.CorrespondingSpan, corresponding_file: []const u8) !CorrespondingLines { // We don't do line comparison for this error, so don't print the note if the line // number is different - if (err_details.err == .string_literal_too_long and err_details.token.line_number == corresponding_span.start_line) { - corresponding_lines.worth_printing_note = false; - return corresponding_lines; + if (err_details.err == .string_literal_too_long and err_details.token.line_number != corresponding_span.start_line) { + return error.NotWorthPrintingNote; } // Don't print the originating line for this error, we know it's really long if (err_details.err == .string_literal_too_long) { - corresponding_lines.worth_printing_lines = false; - return corresponding_lines; + return error.NotWorthPrintingLines; } - var writer = corresponding_lines.lines.writer(allocator); - if (utils.openFileNotDir(cwd, corresponding_file, .{})) |file| { - defer file.close(); - var buffered_reader = std.io.bufferedReader(file.reader()); - writeLinesFromStream(writer, buffered_reader.reader(), corresponding_span.start_line, corresponding_span.end_line) catch |err| switch (err) { - error.LinesNotFound => { - corresponding_lines.lines.clearRetainingCapacity(); - try writer.print("unable to print line(s) from file: {s}", .{@errorName(err)}); - corresponding_lines.lines_is_error_message = true; - return 
corresponding_lines; - }, - else => |e| return e, - }; - } else |err| { - corresponding_lines.lines.clearRetainingCapacity(); - try writer.print("unable to print line(s) from file: {s}", .{@errorName(err)}); - corresponding_lines.lines_is_error_message = true; - return corresponding_lines; - } + var corresponding_lines = CorrespondingLines{ + .span = corresponding_span, + .file = try utils.openFileNotDir(cwd, corresponding_file, .{}), + .buffered_reader = undefined, + .code_page = err_details.code_page, + }; + corresponding_lines.buffered_reader = BufferedReaderType{ + .unbuffered_reader = corresponding_lines.file.reader(), + }; + errdefer corresponding_lines.deinit(); + + var fbs = std.io.fixedBufferStream(&corresponding_lines.line_buf); + const writer = fbs.writer(); + + try corresponding_lines.writeLineFromStreamVerbatim( + writer, + corresponding_lines.buffered_reader.reader(), + corresponding_span.start_line, + ); + + const visual_line = writeSourceSlice( + &corresponding_lines.visual_line_buf, + corresponding_lines.line_buf[0..corresponding_lines.line_len], + err_details.code_page, + ); + corresponding_lines.visual_line_len = visual_line.line.len; + corresponding_lines.truncated = visual_line.truncated; // If the lines are the same as they were before preprocessing, skip printing the note entirely - if (std.mem.eql(u8, lines_for_comparison, corresponding_lines.lines.items)) { - corresponding_lines.worth_printing_note = false; + if (corresponding_span.start_line == corresponding_span.end_line and std.mem.eql( + u8, + line_for_comparison, + corresponding_lines.visual_line_buf[0..corresponding_lines.visual_line_len], + )) { + return error.NotWorthPrintingNote; } + return corresponding_lines; } - pub fn deinit(self: *CorrespondingLines, allocator: std.mem.Allocator) void { - self.lines.deinit(allocator); + pub fn next(self: *CorrespondingLines) !?VisualLine { + if (self.initial_line) { + self.initial_line = false; + return .{ + .line = 
self.visual_line_buf[0..self.visual_line_len], + .truncated = self.truncated, + }; + } + if (self.line_num > self.span.end_line) return null; + if (self.at_eof) return error.LinesNotFound; + + self.line_len = 0; + self.visual_line_len = 0; + + var fbs = std.io.fixedBufferStream(&self.line_buf); + const writer = fbs.writer(); + + try self.writeLineFromStreamVerbatim( + writer, + self.buffered_reader.reader(), + self.line_num, + ); + + const visual_line = writeSourceSlice( + &self.visual_line_buf, + self.line_buf[0..self.line_len], + self.code_page, + ); + self.visual_line_len = visual_line.line.len; + + return visual_line; + } + + fn writeLineFromStreamVerbatim(self: *CorrespondingLines, writer: anytype, input: anytype, line_num: usize) !void { + while (try readByteOrEof(input)) |byte| { + switch (byte) { + '\n', '\r' => { + if (!utils.isLineEndingPair(self.last_byte, byte)) { + const line_complete = self.line_num == line_num; + self.line_num += 1; + if (line_complete) { + self.last_byte = byte; + return; + } + } else { + // reset last_byte to a non-line ending so that + // consecutive CRLF pairs don't get treated as one + // long line ending 'pair' + self.last_byte = 0; + continue; + } + }, + else => { + if (self.line_num == line_num) { + if (writer.writeByte(byte)) { + self.line_len += 1; + } else |err| switch (err) { + error.NoSpaceLeft => {}, + else => |e| return e, + } + } + }, + } + self.last_byte = byte; + } + self.at_eof = true; + // hacky way to get next to return null + self.line_num += 1; + } + + fn readByteOrEof(reader: anytype) !?u8 { + return reader.readByte() catch |err| switch (err) { + error.EndOfStream => return null, + else => |e| return e, + }; + } + + pub fn deinit(self: *CorrespondingLines) void { + self.file.close(); } }; -fn writeSourceSlice(writer: anytype, slice: []const u8) !void { - for (slice) |c| try writeSourceByte(writer, c); +const max_source_line_codepoints = 120; +const max_source_line_bytes = max_source_line_codepoints * 4; + +fn 
writeSourceSlice(buf: []u8, slice: []const u8, code_page: SupportedCodePage) VisualLine { + var src_i: usize = 0; + var dest_i: usize = 0; + var codepoint_count: usize = 0; + while (src_i < slice.len) { + const codepoint = code_page.codepointAt(src_i, slice) orelse break; + defer src_i += codepoint.byte_len; + const display_codepoint = codepointForDisplay(codepoint) orelse continue; + codepoint_count += 1; + if (codepoint_count > max_source_line_codepoints) { + return .{ .line = buf[0..dest_i], .truncated = true }; + } + const utf8_len = std.unicode.utf8Encode(display_codepoint, buf[dest_i..]) catch unreachable; + dest_i += utf8_len; + } + return .{ .line = buf[0..dest_i], .truncated = false }; } -inline fn writeSourceByte(writer: anytype, byte: u8) !void { - switch (byte) { - '\x00'...'\x08', '\x0E'...'\x1F', '\x7F' => try writer.writeAll("�"), +fn codepointForDisplay(codepoint: code_pages.Codepoint) ?u21 { + return switch (codepoint.value) { + '\x00'...'\x08', + '\x0E'...'\x1F', + '\x7F', + code_pages.Codepoint.invalid, + => '�', // \r is seemingly ignored by the RC compiler so skipping it when printing source lines // could help avoid confusing output (e.g. RC\rDATA if printed verbatim would show up // in the console as DATA but the compiler reads it as RCDATA) @@ -1033,44 +1256,8 @@ inline fn writeSourceByte(writer: anytype, byte: u8) !void { // NOTE: This is irrelevant when using the clang preprocessor, because unpaired \r // characters get converted to \n, but may become relevant if another // preprocessor is used instead. 
- '\r' => {}, - '\t', '\x0B', '\x0C' => try writer.writeByte(' '), - else => try writer.writeByte(byte), - } -} - -pub fn writeLinesFromStream(writer: anytype, input: anytype, start_line: usize, end_line: usize) !void { - var line_num: usize = 1; - var last_byte: u8 = 0; - while (try readByteOrEof(input)) |byte| { - switch (byte) { - '\n', '\r' => { - if (!utils.isLineEndingPair(last_byte, byte)) { - if (line_num == end_line) return; - if (line_num >= start_line) try writeSourceByte(writer, byte); - line_num += 1; - } else { - // reset last_byte to a non-line ending so that - // consecutive CRLF pairs don't get treated as one - // long line ending 'pair' - last_byte = 0; - continue; - } - }, - else => { - if (line_num >= start_line) try writeSourceByte(writer, byte); - }, - } - last_byte = byte; - } - if (line_num != end_line) { - return error.LinesNotFound; - } -} - -pub fn readByteOrEof(reader: anytype) !?u8 { - return reader.readByte() catch |err| switch (err) { - error.EndOfStream => return null, - else => |e| return e, + '\r' => null, + '\t', '\x0B', '\x0C' => ' ', + else => |v| v, }; } diff --git a/lib/compiler/resinator/lang.zig b/lib/compiler/resinator/lang.zig index bf3668071007..7cc69f1cf1ed 100644 --- a/lib/compiler/resinator/lang.zig +++ b/lib/compiler/resinator/lang.zig @@ -119,7 +119,7 @@ test tagToId { } test "exhaustive tagToId" { - inline for (@typeInfo(LanguageId).Enum.fields) |field| { + inline for (@typeInfo(LanguageId).@"enum".fields) |field| { const id = tagToId(field.name) catch |err| { std.debug.print("tag: {s}\n", .{field.name}); return err; diff --git a/lib/compiler/resinator/lex.zig b/lib/compiler/resinator/lex.zig index 91ebba467dc0..cfb75e4c5b53 100644 --- a/lib/compiler/resinator/lex.zig +++ b/lib/compiler/resinator/lex.zig @@ -8,7 +8,7 @@ const std = @import("std"); const ErrorDetails = @import("errors.zig").ErrorDetails; const columnWidth = @import("literals.zig").columnWidth; const code_pages = @import("code_pages.zig"); -const 
CodePage = code_pages.CodePage; +const SupportedCodePage = code_pages.SupportedCodePage; const SourceMappings = @import("source_mapping.zig").SourceMappings; const isNonAsciiDigit = @import("utils.zig").isNonAsciiDigit; @@ -62,13 +62,6 @@ pub const Token = struct { return buffer[self.start..self.end]; } - pub fn nameForErrorDisplay(self: Token, buffer: []const u8) []const u8 { - return switch (self.id) { - .eof => self.id.nameForErrorDisplay(), - else => self.slice(buffer), - }; - } - /// Returns 0-based column pub fn calculateColumn(token: Token, source: []const u8, tab_columns: usize, maybe_line_start: ?usize) usize { const line_start = maybe_line_start orelse token.getLineStartForColumnCalc(source); @@ -214,18 +207,19 @@ pub const Lexer = struct { line_handler: LineHandler, at_start_of_line: bool = true, error_context_token: ?Token = null, - current_code_page: CodePage, - default_code_page: CodePage, + current_code_page: SupportedCodePage, + default_code_page: SupportedCodePage, source_mappings: ?*SourceMappings, max_string_literal_codepoints: u15, /// Needed to determine whether or not the output code page should /// be set in the parser. seen_pragma_code_pages: u2 = 0, + last_pragma_code_page_token: ?Token = null, pub const Error = LexError; pub const LexerOptions = struct { - default_code_page: CodePage = .windows1252, + default_code_page: SupportedCodePage = .windows1252, source_mappings: ?*SourceMappings = null, max_string_literal_codepoints: u15 = default_max_string_literal_codepoints, }; @@ -291,6 +285,8 @@ pub const Lexer = struct { }, // NBSP only counts as whitespace at the start of a line (but // can be intermixed with other whitespace). Who knows why. 
+ // TODO: This should either be removed, or it should also include + // the codepoints listed in disjoint_code_page.zig '\xA0' => if (self.at_start_of_line) { result.start = self.index + codepoint.byte_len; } else { @@ -305,12 +301,8 @@ pub const Lexer = struct { } self.at_start_of_line = false; }, - // Semi-colon acts as a line-terminator, but in this lexing mode - // that's only true if it's at the start of a line. ';' => { - if (self.at_start_of_line) { - state = .semicolon; - } + state = .semicolon; self.at_start_of_line = false; }, else => { @@ -345,7 +337,11 @@ pub const Lexer = struct { } } else { // got EOF switch (state) { - .start, .semicolon => {}, + .start => {}, + .semicolon => { + // Skip past everything up to the EOF + result.start = self.index; + }, .literal => { result.id = .literal; }, @@ -357,6 +353,10 @@ pub const Lexer = struct { } result.end = self.index; + + // EOF tokens must have their start index match the end index + std.debug.assert(result.id != .eof or result.start == result.end); + return result; } @@ -796,7 +796,11 @@ pub const Lexer = struct { } } else { // got EOF switch (state) { - .start, .semicolon => {}, + .start => {}, + .semicolon => { + // Skip past everything up to the EOF + result.start = self.index; + }, .literal_or_quoted_wide_string, .literal, .e, .en, .b, .be, .beg, .begi => { result.id = .literal; }, @@ -835,6 +839,9 @@ pub const Lexer = struct { } } + // EOF tokens must have their start index match the end index + std.debug.assert(result.id != .eof or result.start == result.end); + return result; } @@ -878,7 +885,7 @@ pub const Lexer = struct { // and miscompilations when used within string literals. We avoid the miscompilation // within string literals and emit a warning, but outside of string literals it makes // more sense to just disallow these codepoints. 
- 0x900, 0xA00, 0xA0D, 0x2000, 0xFFFE, 0xD00 => if (!in_string_literal) error.IllegalCodepointOutsideStringLiterals else return, + 0x900, 0xA00, 0xA0D, 0x2000, 0xD00, 0xFFFE, 0xFFFF => if (!in_string_literal) error.IllegalCodepointOutsideStringLiterals else return, else => return, }; self.error_context_token = .{ @@ -899,90 +906,11 @@ pub const Lexer = struct { }; errdefer self.error_context_token = token; const full_command = self.buffer[start..end]; - var command = full_command; - - // Anything besides exactly this is ignored by the Windows RC implementation - const expected_directive = "#pragma"; - if (!std.mem.startsWith(u8, command, expected_directive)) return; - command = command[expected_directive.len..]; - - if (command.len == 0 or !std.ascii.isWhitespace(command[0])) return; - while (command.len > 0 and std.ascii.isWhitespace(command[0])) { - command = command[1..]; - } - - // Note: CoDe_PaGeZ is also treated as "code_page" by the Windows RC implementation, - // and it will error with 'Missing left parenthesis in code_page #pragma' - const expected_extension = "code_page"; - if (!std.ascii.startsWithIgnoreCase(command, expected_extension)) return; - command = command[expected_extension.len..]; - - while (command.len > 0 and std.ascii.isWhitespace(command[0])) { - command = command[1..]; - } - - if (command.len == 0 or command[0] != '(') { - return error.CodePagePragmaMissingLeftParen; - } - command = command[1..]; - while (command.len > 0 and std.ascii.isWhitespace(command[0])) { - command = command[1..]; - } - - var num_str: []u8 = command[0..0]; - while (command.len > 0 and (command[0] != ')' and !std.ascii.isWhitespace(command[0]))) { - command = command[1..]; - num_str.len += 1; - } - - if (num_str.len == 0) { - return error.CodePagePragmaNotInteger; - } - - while (command.len > 0 and std.ascii.isWhitespace(command[0])) { - command = command[1..]; - } - - if (command.len == 0 or command[0] != ')') { - return error.CodePagePragmaMissingRightParen; - } - 
- const code_page = code_page: { - if (std.ascii.eqlIgnoreCase("DEFAULT", num_str)) { - break :code_page self.default_code_page; - } - - // The Win32 compiler behaves fairly strangely around maxInt(u32): - // - If the overflowed u32 wraps and becomes a known code page ID, then - // it will error/warn with "Codepage not valid: ignored" (depending on /w) - // - If the overflowed u32 wraps and does not become a known code page ID, - // then it will error with 'constant too big' and 'Codepage not integer' - // - // Instead of that, we just have a separate error specifically for overflow. - const num = parseCodePageNum(num_str) catch |err| switch (err) { - error.InvalidCharacter => return error.CodePagePragmaNotInteger, - error.Overflow => return error.CodePagePragmaOverflow, - }; - - // Anything that starts with 0 but does not resolve to 0 is treated as invalid, e.g. 01252 - if (num_str[0] == '0' and num != 0) { - return error.CodePagePragmaInvalidCodePage; - } - // Anything that resolves to 0 is treated as 'not an integer' by the Win32 implementation. - else if (num == 0) { - return error.CodePagePragmaNotInteger; - } - // Anything above u16 max is not going to be found since our CodePage enum is backed by a u16. 
- if (num > std.math.maxInt(u16)) { - return error.CodePagePragmaInvalidCodePage; - } - - break :code_page code_pages.CodePage.getByIdentifierEnsureSupported(@intCast(num)) catch |err| switch (err) { - error.InvalidCodePage => return error.CodePagePragmaInvalidCodePage, - error.UnsupportedCodePage => return error.CodePagePragmaUnsupportedCodePage, - }; - }; + const code_page = (parsePragmaCodePage(full_command) catch |err| switch (err) { + error.NotPragma, error.NotCodePagePragma => return, + else => |e| return e, + }) orelse self.default_code_page; // https://learn.microsoft.com/en-us/windows/win32/menurc/pragma-directives // > This pragma is not supported in an included resource file (.rc) @@ -998,24 +926,16 @@ pub const Lexer = struct { } self.seen_pragma_code_pages +|= 1; + self.last_pragma_code_page_token = token; self.current_code_page = code_page; } - fn parseCodePageNum(str: []const u8) !u32 { - var x: u32 = 0; - for (str) |c| { - const digit = try std.fmt.charToDigit(c, 10); - if (x != 0) x = try std.math.mul(u32, x, 10); - x = try std.math.add(u32, x, digit); - } - return x; - } - pub fn getErrorDetails(self: Self, lex_err: LexError) ErrorDetails { const err = switch (lex_err) { error.UnfinishedStringLiteral => ErrorDetails.Error.unfinished_string_literal, error.StringLiteralTooLong => return .{ .err = .string_literal_too_long, + .code_page = self.current_code_page, .token = self.error_context_token.?, .extra = .{ .number = self.max_string_literal_codepoints }, }, @@ -1037,11 +957,112 @@ pub const Lexer = struct { }; return .{ .err = err, + .code_page = self.current_code_page, .token = self.error_context_token.?, }; } }; +fn parseCodePageNum(str: []const u8) !u32 { + var x: u32 = 0; + for (str) |c| { + const digit = try std.fmt.charToDigit(c, 10); + if (x != 0) x = try std.math.mul(u32, x, 10); + x = try std.math.add(u32, x, digit); + } + return x; +} + +/// Returns `null` when the code_page is set to DEFAULT +pub fn parsePragmaCodePage(full_command: 
[]const u8) !?SupportedCodePage { + var command = full_command; + + // Anything besides exactly this is ignored by the Windows RC implementation + const expected_directive = "#pragma"; + if (!std.mem.startsWith(u8, command, expected_directive)) return error.NotPragma; + command = command[expected_directive.len..]; + + if (command.len == 0 or !std.ascii.isWhitespace(command[0])) return error.NotCodePagePragma; + while (command.len > 0 and std.ascii.isWhitespace(command[0])) { + command = command[1..]; + } + + // Note: CoDe_PaGeZ is also treated as "code_page" by the Windows RC implementation, + // and it will error with 'Missing left parenthesis in code_page #pragma' + const expected_extension = "code_page"; + if (!std.ascii.startsWithIgnoreCase(command, expected_extension)) return error.NotCodePagePragma; + command = command[expected_extension.len..]; + + while (command.len > 0 and std.ascii.isWhitespace(command[0])) { + command = command[1..]; + } + + if (command.len == 0 or command[0] != '(') { + return error.CodePagePragmaMissingLeftParen; + } + command = command[1..]; + + while (command.len > 0 and std.ascii.isWhitespace(command[0])) { + command = command[1..]; + } + + var num_str: []u8 = command[0..0]; + while (command.len > 0 and (command[0] != ')' and !std.ascii.isWhitespace(command[0]))) { + command = command[1..]; + num_str.len += 1; + } + + if (num_str.len == 0) { + return error.CodePagePragmaNotInteger; + } + + while (command.len > 0 and std.ascii.isWhitespace(command[0])) { + command = command[1..]; + } + + if (command.len == 0 or command[0] != ')') { + return error.CodePagePragmaMissingRightParen; + } + + const code_page: ?SupportedCodePage = code_page: { + if (std.ascii.eqlIgnoreCase("DEFAULT", num_str)) { + break :code_page null; + } + + // The Win32 compiler behaves fairly strangely around maxInt(u32): + // - If the overflowed u32 wraps and becomes a known code page ID, then + // it will error/warn with "Codepage not valid: ignored" (depending on 
/w) + // - If the overflowed u32 wraps and does not become a known code page ID, + // then it will error with 'constant too big' and 'Codepage not integer' + // + // Instead of that, we just have a separate error specifically for overflow. + const num = parseCodePageNum(num_str) catch |err| switch (err) { + error.InvalidCharacter => return error.CodePagePragmaNotInteger, + error.Overflow => return error.CodePagePragmaOverflow, + }; + + // Anything that starts with 0 but does not resolve to 0 is treated as invalid, e.g. 01252 + if (num_str[0] == '0' and num != 0) { + return error.CodePagePragmaInvalidCodePage; + } + // Anything that resolves to 0 is treated as 'not an integer' by the Win32 implementation. + else if (num == 0) { + return error.CodePagePragmaNotInteger; + } + // Anything above u16 max is not going to be found since our CodePage enum is backed by a u16. + if (num > std.math.maxInt(u16)) { + return error.CodePagePragmaInvalidCodePage; + } + + break :code_page code_pages.getByIdentifierEnsureSupported(@intCast(num)) catch |err| switch (err) { + error.InvalidCodePage => return error.CodePagePragmaInvalidCodePage, + error.UnsupportedCodePage => return error.CodePagePragmaUnsupportedCodePage, + }; + }; + + return code_page; +} + fn testLexNormal(source: []const u8, expected_tokens: []const Token.Id) !void { var lexer = Lexer.init(source, .{}); if (dumpTokensDuringTests) std.debug.print("\n----------------------\n{s}\n----------------------\n", .{lexer.buffer}); @@ -1074,7 +1095,7 @@ test "normal: string literals" { test "superscript chars and code pages" { const firstToken = struct { - pub fn firstToken(source: []const u8, default_code_page: CodePage, comptime lex_method: Lexer.LexMethod) LexError!Token { + pub fn firstToken(source: []const u8, default_code_page: SupportedCodePage, comptime lex_method: Lexer.LexMethod) LexError!Token { var lexer = Lexer.init(source, .{ .default_code_page = default_code_page }); return lexer.next(lex_method); } diff --git 
a/lib/compiler/resinator/literals.zig b/lib/compiler/resinator/literals.zig index b653e08bd818..ca2d353daa85 100644 --- a/lib/compiler/resinator/literals.zig +++ b/lib/compiler/resinator/literals.zig @@ -1,6 +1,6 @@ const std = @import("std"); const code_pages = @import("code_pages.zig"); -const CodePage = code_pages.CodePage; +const SupportedCodePage = code_pages.SupportedCodePage; const windows1252 = @import("windows1252.zig"); const ErrorDetails = @import("errors.zig").ErrorDetails; const DiagnosticsContext = @import("errors.zig").DiagnosticsContext; @@ -18,7 +18,7 @@ pub fn isValidNumberDataLiteral(str: []const u8) bool { pub const SourceBytes = struct { slice: []const u8, - code_page: CodePage, + code_page: SupportedCodePage, }; pub const StringType = enum { ascii, wide }; @@ -53,7 +53,7 @@ pub const StringType = enum { ascii, wide }; /// branches should never actually be hit during this function. pub const IterativeStringParser = struct { source: []const u8, - code_page: CodePage, + code_page: SupportedCodePage, /// The type of the string inferred by the prefix (L"" or "") /// This is what matters for things like the maximum digits in an /// escape sequence, whether or not invalid escape sequences are skipped, etc. @@ -98,32 +98,55 @@ pub const IterativeStringParser = struct { pub const ParsedCodepoint = struct { codepoint: u21, - /// Note: If this is true, `codepoint` will be a value with a max of maxInt(u16). - /// This is enforced by using saturating arithmetic, so in e.g. a wide string literal the - /// octal escape sequence \7777777 (2,097,151) will be parsed into the value 0xFFFF (65,535). - /// If the value needs to be truncated to a smaller integer (for ASCII string literals), then that - /// must be done by the caller. + /// Note: If this is true, `codepoint` will have an effective maximum value + /// of 0xFFFF, as `codepoint` is calculated using wrapping arithmetic on a u16. + /// If the value needs to be truncated to a smaller integer (e.g. 
for ASCII string + /// literals), then that must be done by the caller. from_escaped_integer: bool = false, + /// Denotes that the codepoint is: + /// - Escaped (has a \ in front of it), and + /// - Has a value >= U+10000, meaning it would be encoded as a surrogate + /// pair in UTF-16, and + /// - Is part of a wide string literal + /// + /// Normally in wide string literals, invalid escapes are omitted + /// during parsing (the codepoints are not returned at all during + /// the `next` call), but this is a special case in which the + /// escape only applies to the high surrogate pair of the codepoint. + /// + /// TODO: Maybe just return the low surrogate codepoint by itself in this case. + escaped_surrogate_pair: bool = false, }; pub fn next(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint { const result = try self.nextUnchecked(); if (self.diagnostics != null and result != null and !result.?.from_escaped_integer) { switch (result.?.codepoint) { - 0x900, 0xA00, 0xA0D, 0x2000, 0xFFFE, 0xD00 => { + 0x0900, 0x0A00, 0x0A0D, 0x2000, 0x0D00 => { const err: ErrorDetails.Error = if (result.?.codepoint == 0xD00) .rc_would_miscompile_codepoint_skip else - .rc_would_miscompile_codepoint_byte_swap; + .rc_would_miscompile_codepoint_whitespace; try self.diagnostics.?.diagnostics.append(ErrorDetails{ .err = err, .type = .warning, + .code_page = self.code_page, .token = self.diagnostics.?.token, .extra = .{ .number = result.?.codepoint }, }); + }, + 0xFFFE, 0xFFFF => { try self.diagnostics.?.diagnostics.append(ErrorDetails{ - .err = err, + .err = .rc_would_miscompile_codepoint_bom, + .type = .warning, + .code_page = self.code_page, + .token = self.diagnostics.?.token, + .extra = .{ .number = result.?.codepoint }, + }); + try self.diagnostics.?.diagnostics.append(ErrorDetails{ + .err = .rc_would_miscompile_codepoint_bom, .type = .note, + .code_page = self.code_page, .token = self.diagnostics.?.token, .print_source_line = false, .extra = .{ .number = 
result.?.codepoint }, @@ -188,11 +211,13 @@ pub const IterativeStringParser = struct { try self.diagnostics.?.diagnostics.append(ErrorDetails{ .err = .tab_converted_to_spaces, .type = .warning, + .code_page = self.code_page, .token = self.diagnostics.?.token, }); try self.diagnostics.?.diagnostics.append(ErrorDetails{ .err = .tab_converted_to_spaces, .type = .note, + .code_page = self.code_page, .token = self.diagnostics.?.token, .print_source_line = false, }); @@ -246,8 +271,9 @@ pub const IterativeStringParser = struct { switch (c) { 'a', 'A' => { self.index += codepoint.byte_len; + // might be a bug in RC, but matches its behavior return .{ .codepoint = '\x08' }; - }, // might be a bug in RC, but matches its behavior + }, 'n' => { self.index += codepoint.byte_len; return .{ .codepoint = '\n' }; @@ -269,7 +295,65 @@ pub const IterativeStringParser = struct { backtrack = true; }, else => switch (self.declared_string_type) { - .wide => {}, // invalid escape sequences are skipped in wide strings + .wide => { + // All invalid escape sequences are skipped in wide strings, + // but there is a special case around \ where the \ + // is skipped but the tab character is processed. + // It's actually a bit weirder than that, though, since + // the preprocessor is the one that does the -> spaces + // conversion, so it goes something like this: + // + // Before preprocessing: L"\" + // After preprocessing: L"\ " + // + // So the parser only sees an escaped space character followed + // by some other number of spaces >= 0. + // + // However, our preprocessor keeps tab characters intact, so we emulate + // the above behavior by skipping the \ and then outputting one less + // space than normal for the character. 
+ if (c == '\t') { + // Only warn about a tab getting converted to spaces once per string + if (self.diagnostics != null and !self.seen_tab) { + try self.diagnostics.?.diagnostics.append(ErrorDetails{ + .err = .tab_converted_to_spaces, + .type = .warning, + .code_page = self.code_page, + .token = self.diagnostics.?.token, + }); + try self.diagnostics.?.diagnostics.append(ErrorDetails{ + .err = .tab_converted_to_spaces, + .type = .note, + .code_page = self.code_page, + .token = self.diagnostics.?.token, + .print_source_line = false, + }); + self.seen_tab = true; + } + + const cols = columnsUntilTabStop(self.column, 8); + // If the tab character would only be converted to a single space, + // then we can just skip both the \ and the and move on. + if (cols > 1) { + self.num_pending_spaces = @intCast(cols - 2); + self.index += codepoint.byte_len; + return .{ .codepoint = ' ' }; + } + } + // There's a second special case when the codepoint would be encoded + // as a surrogate pair in UTF-16, as the escape 'applies' to the + // high surrogate pair only in this instance. This is a side-effect + // of the Win32 RC compiler preprocessor outputting UTF-16 and the + // compiler itself seemingly working on code units instead of code points + // in this particular instance. + // + // We emulate this behavior by emitting the codepoint, but with a marker + // that indicates that it needs to be handled specially. 
+ if (c >= 0x10000 and c != code_pages.Codepoint.invalid) { + self.index += codepoint.byte_len; + return .{ .codepoint = c, .escaped_surrogate_pair = true }; + } + }, .ascii => { // we intentionally avoid incrementing self.index // to handle the current char in the next call, @@ -303,6 +387,9 @@ pub const IterativeStringParser = struct { }, .escaped_octal => switch (c) { '0'...'7' => { + // Note: We use wrapping arithmetic on a u16 here since there's been no observed + // string parsing scenario where an escaped integer with a value >= the u16 + // max is interpreted as anything but the truncated u16 value. string_escape_n *%= 8; string_escape_n +%= std.fmt.charToDigit(@intCast(c), 8) catch unreachable; string_escape_i += 1; @@ -367,7 +454,7 @@ pub const IterativeStringParser = struct { pub const StringParseOptions = struct { start_column: usize = 0, diagnostics: ?DiagnosticsContext = null, - output_code_page: CodePage = .windows1252, + output_code_page: SupportedCodePage, }; pub fn parseQuotedString( @@ -389,46 +476,52 @@ pub fn parseQuotedString( while (try iterative_parser.next()) |parsed| { const c = parsed.codepoint; - if (parsed.from_escaped_integer) { - // We truncate here to get the correct behavior for ascii strings - try buf.append(std.mem.nativeToLittle(T, @truncate(c))); - } else { - switch (literal_type) { - .ascii => switch (options.output_code_page) { - .windows1252 => { - if (windows1252.bestFitFromCodepoint(c)) |best_fit| { - try buf.append(best_fit); - } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) { - try buf.append('?'); - } else { - try buf.appendSlice("??"); - } - }, - .utf8 => { - var codepoint_to_encode = c; - if (c == code_pages.Codepoint.invalid) { - codepoint_to_encode = '�'; - } - var utf8_buf: [4]u8 = undefined; - const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable; - try buf.appendSlice(utf8_buf[0..utf8_len]); - }, - else => unreachable, // Unsupported code page - }, - .wide => { - if 
(c == code_pages.Codepoint.invalid) { - try buf.append(std.mem.nativeToLittle(u16, '�')); - } else if (c < 0x10000) { - const short: u16 = @intCast(c); - try buf.append(std.mem.nativeToLittle(u16, short)); + switch (literal_type) { + .ascii => switch (options.output_code_page) { + .windows1252 => { + if (parsed.from_escaped_integer) { + try buf.append(@truncate(c)); + } else if (windows1252.bestFitFromCodepoint(c)) |best_fit| { + try buf.append(best_fit); + } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) { + try buf.append('?'); } else { + try buf.appendSlice("??"); + } + }, + .utf8 => { + var codepoint_to_encode = c; + if (parsed.from_escaped_integer) { + codepoint_to_encode = @as(T, @truncate(c)); + } + const escaped_integer_outside_ascii_range = parsed.from_escaped_integer and codepoint_to_encode > 0x7F; + if (escaped_integer_outside_ascii_range or c == code_pages.Codepoint.invalid) { + codepoint_to_encode = '�'; + } + var utf8_buf: [4]u8 = undefined; + const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable; + try buf.appendSlice(utf8_buf[0..utf8_len]); + }, + }, + .wide => { + // Parsing any string type as a wide string is handled separately, see parseQuotedStringAsWideString + std.debug.assert(iterative_parser.declared_string_type == .wide); + if (parsed.from_escaped_integer) { + try buf.append(std.mem.nativeToLittle(u16, @truncate(c))); + } else if (c == code_pages.Codepoint.invalid) { + try buf.append(std.mem.nativeToLittle(u16, '�')); + } else if (c < 0x10000) { + const short: u16 = @intCast(c); + try buf.append(std.mem.nativeToLittle(u16, short)); + } else { + if (!parsed.escaped_surrogate_pair) { const high = @as(u16, @intCast((c - 0x10000) >> 10)) + 0xD800; try buf.append(std.mem.nativeToLittle(u16, high)); - const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00; - try buf.append(std.mem.nativeToLittle(u16, low)); } - }, - } + const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00; + try 
buf.append(std.mem.nativeToLittle(u16, low)); + } + }, } } @@ -449,9 +542,59 @@ pub fn parseQuotedWideString(allocator: std.mem.Allocator, bytes: SourceBytes, o return parseQuotedString(.wide, allocator, bytes, options); } +/// Parses any string type into a wide string. +/// If the string is declared as a wide string (L""), then it is handled normally. +/// Otherwise, things are fairly normal with the exception of escaped integers. +/// Escaped integers are handled by: +/// - Truncating the escape to a u8 +/// - Reinterpeting the u8 as a byte from the *output* code page +/// - Outputting the codepoint that corresponds to the interpreted byte, or � if no such +/// interpretation is possible +/// For example, if the code page is UTF-8, then while \x80 is a valid start byte, it's +/// interpreted as a single byte, so it ends up being seen as invalid and � is outputted. +/// If the code page is Windows-1252, then \x80 is interpreted to be € which has the +/// codepoint U+20AC, so the UTF-16 encoding of U+20AC is outputted. pub fn parseQuotedStringAsWideString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![:0]u16 { std.debug.assert(bytes.slice.len >= 2); // "" - return parseQuotedString(.wide, allocator, bytes, options); + + if (bytes.slice[0] == 'l' or bytes.slice[0] == 'L') { + return parseQuotedWideString(allocator, bytes, options); + } + + // Note: We're only handling the case of parsing an ASCII string into a wide string from here on out. 
+ // TODO: The logic below is similar to that in AcceleratorKeyCodepointTranslator, might be worth merging the two + + var buf = try std.ArrayList(u16).initCapacity(allocator, bytes.slice.len); + errdefer buf.deinit(); + + var iterative_parser = IterativeStringParser.init(bytes, options); + + while (try iterative_parser.next()) |parsed| { + const c = parsed.codepoint; + if (parsed.from_escaped_integer) { + std.debug.assert(c != code_pages.Codepoint.invalid); + const byte_to_interpret: u8 = @truncate(c); + const code_unit_to_encode: u16 = switch (options.output_code_page) { + .windows1252 => windows1252.toCodepoint(byte_to_interpret), + .utf8 => if (byte_to_interpret > 0x7F) '�' else byte_to_interpret, + }; + try buf.append(std.mem.nativeToLittle(u16, code_unit_to_encode)); + } else if (c == code_pages.Codepoint.invalid) { + try buf.append(std.mem.nativeToLittle(u16, '�')); + } else if (c < 0x10000) { + const short: u16 = @intCast(c); + try buf.append(std.mem.nativeToLittle(u16, short)); + } else { + if (!parsed.escaped_surrogate_pair) { + const high = @as(u16, @intCast((c - 0x10000) >> 10)) + 0xD800; + try buf.append(std.mem.nativeToLittle(u16, high)); + } + const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00; + try buf.append(std.mem.nativeToLittle(u16, low)); + } + } + + return buf.toOwnedSliceSentinel(0); } test "parse quoted ascii string" { @@ -464,133 +607,155 @@ test "parse quoted ascii string" { \\"hello" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // hex with 0 digits try std.testing.expectEqualSlices(u8, "\x00", try parseQuotedAsciiString(arena, .{ .slice = \\"\x" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // hex max of 2 digits try std.testing.expectEqualSlices(u8, "\xFFf", try parseQuotedAsciiString(arena, .{ .slice = \\"\XfFf" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // octal with invalid octal digit try 
std.testing.expectEqualSlices(u8, "\x019", try parseQuotedAsciiString(arena, .{ .slice = \\"\19" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // escaped quotes try std.testing.expectEqualSlices(u8, " \" ", try parseQuotedAsciiString(arena, .{ .slice = \\" "" " , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // backslash right before escaped quotes try std.testing.expectEqualSlices(u8, "\"", try parseQuotedAsciiString(arena, .{ .slice = \\"\""" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // octal overflow try std.testing.expectEqualSlices(u8, "\x01", try parseQuotedAsciiString(arena, .{ .slice = \\"\401" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // escapes try std.testing.expectEqualSlices(u8, "\x08\n\r\t\\", try parseQuotedAsciiString(arena, .{ .slice = \\"\a\n\r\t\\" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // uppercase escapes try std.testing.expectEqualSlices(u8, "\x08\\N\\R\t\\", try parseQuotedAsciiString(arena, .{ .slice = \\"\A\N\R\T\\" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // backslash on its own try std.testing.expectEqualSlices(u8, "\\", try parseQuotedAsciiString(arena, .{ .slice = \\"\" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // unrecognized escapes try std.testing.expectEqualSlices(u8, "\\b", try parseQuotedAsciiString(arena, .{ .slice = \\"\b" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // escaped carriage returns try std.testing.expectEqualSlices(u8, "\\", try parseQuotedAsciiString( arena, .{ .slice = "\"\\\r\r\r\r\r\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // escaped newlines try std.testing.expectEqualSlices(u8, "", 
try parseQuotedAsciiString( arena, .{ .slice = "\"\\\n\n\n\n\n\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // escaped CRLF pairs try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString( arena, .{ .slice = "\"\\\r\n\r\n\r\n\r\n\r\n\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // escaped newlines with other whitespace try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString( arena, .{ .slice = "\"\\\n \t\r\n \r\t\n \t\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // literal tab characters get converted to spaces (dependent on source file columns) try std.testing.expectEqualSlices(u8, " ", try parseQuotedAsciiString( arena, .{ .slice = "\"\t\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqualSlices(u8, "abc ", try parseQuotedAsciiString( arena, .{ .slice = "\"abc\t\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqualSlices(u8, "abcdefg ", try parseQuotedAsciiString( arena, .{ .slice = "\"abcdefg\t\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqualSlices(u8, "\\ ", try parseQuotedAsciiString( arena, .{ .slice = "\"\\\t\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // literal CR's get dropped try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString( arena, .{ .slice = "\"\r\r\r\r\r\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // contiguous newlines and whitespace get collapsed to try std.testing.expectEqualSlices(u8, " \n", try parseQuotedAsciiString( arena, .{ .slice = "\"\n\r\r \r\n \t \"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); } @@ -602,32 +767,32 @@ test "parse quoted ascii string with utf8 code page" { try 
std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString( arena, .{ .slice = "\"\"", .code_page = .utf8 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Codepoints that don't have a Windows-1252 representation get converted to ? try std.testing.expectEqualSlices(u8, "?????????", try parseQuotedAsciiString( arena, .{ .slice = "\"кириллица\"", .code_page = .utf8 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Codepoints that have a best fit mapping get converted accordingly, // these are box drawing codepoints try std.testing.expectEqualSlices(u8, "\x2b\x2d\x2b", try parseQuotedAsciiString( arena, .{ .slice = "\"┌─┐\"", .code_page = .utf8 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Invalid UTF-8 gets converted to ? depending on well-formedness try std.testing.expectEqualSlices(u8, "????", try parseQuotedAsciiString( arena, .{ .slice = "\"\xf0\xf0\x80\x80\x80\"", .code_page = .utf8 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Codepoints that would require a UTF-16 surrogate pair get converted to ?? 
try std.testing.expectEqualSlices(u8, "??", try parseQuotedAsciiString( arena, .{ .slice = "\"\xF2\xAF\xBA\xB4\"", .code_page = .utf8 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Output code page changes how invalid UTF-8 gets converted, since it @@ -652,6 +817,18 @@ test "parse quoted ascii string with utf8 code page" { )); } +test "parse quoted string with different input/output code pages" { + var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena_allocator.deinit(); + const arena = arena_allocator.allocator(); + + try std.testing.expectEqualSlices(u8, "€���\x60\x7F", try parseQuotedAsciiString( + arena, + .{ .slice = "\"\x80\\x8a\\600\\612\\540\\577\"", .code_page = .windows1252 }, + .{ .output_code_page = .utf8 }, + )); +} + test "parse quoted wide string" { var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator); defer arena_allocator.deinit(); @@ -662,52 +839,62 @@ test "parse quoted wide string" { \\L"hello" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // hex with 0 digits try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{0x0}, try parseQuotedWideString(arena, .{ .slice = \\L"\x" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // hex max of 4 digits try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{ std.mem.nativeToLittle(u16, 0xFFFF), std.mem.nativeToLittle(u16, 'f') }, try parseQuotedWideString(arena, .{ .slice = \\L"\XfFfFf" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // octal max of 7 digits try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{ std.mem.nativeToLittle(u16, 0x9493), std.mem.nativeToLittle(u16, '3'), std.mem.nativeToLittle(u16, '3') }, try parseQuotedWideString(arena, .{ .slice = \\L"\111222333" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // octal overflow try 
std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{std.mem.nativeToLittle(u16, 0xFF01)}, try parseQuotedWideString(arena, .{ .slice = \\L"\777401" , .code_page = .windows1252, - }, .{})); + }, .{ + .output_code_page = .windows1252, + })); // literal tab characters get converted to spaces (dependent on source file columns) try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("abcdefg "), try parseQuotedWideString( arena, .{ .slice = "L\"abcdefg\t\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Windows-1252 conversion try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("ðð€€€"), try parseQuotedWideString( arena, .{ .slice = "L\"\xf0\xf0\x80\x80\x80\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Invalid escape sequences are skipped try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral(""), try parseQuotedWideString( arena, .{ .slice = "L\"\\H\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); } @@ -719,18 +906,18 @@ test "parse quoted wide string with utf8 code page" { try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{}, try parseQuotedWideString( arena, .{ .slice = "L\"\"", .code_page = .utf8 }, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("кириллица"), try parseQuotedWideString( arena, .{ .slice = "L\"кириллица\"", .code_page = .utf8 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Invalid UTF-8 gets converted to � depending on well-formedness try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("����"), try parseQuotedWideString( arena, .{ .slice = "L\"\xf0\xf0\x80\x80\x80\"", .code_page = .utf8 }, - .{}, + .{ .output_code_page = .windows1252 }, )); } @@ -742,29 +929,29 @@ test "parse quoted ascii string as wide string" { try 
std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("кириллица"), try parseQuotedStringAsWideString( arena, .{ .slice = "\"кириллица\"", .code_page = .utf8 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Whether or not invalid escapes are skipped is still determined by the L prefix try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("\\H"), try parseQuotedStringAsWideString( arena, .{ .slice = "\"\\H\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral(""), try parseQuotedStringAsWideString( arena, .{ .slice = "L\"\\H\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); // Maximum escape sequence value is also determined by the L prefix try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{ std.mem.nativeToLittle(u16, 0x12), std.mem.nativeToLittle(u16, '3'), std.mem.nativeToLittle(u16, '4') }, try parseQuotedStringAsWideString( arena, .{ .slice = "\"\\x1234\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{std.mem.nativeToLittle(u16, 0x1234)}, try parseQuotedStringAsWideString( arena, .{ .slice = "L\"\\x1234\"", .code_page = .windows1252 }, - .{}, + .{ .output_code_page = .windows1252 }, )); } diff --git a/lib/compiler/resinator/main.zig b/lib/compiler/resinator/main.zig index 6973c6e6b592..abad1341ef54 100644 --- a/lib/compiler/resinator/main.zig +++ b/lib/compiler/resinator/main.zig @@ -7,6 +7,7 @@ const Diagnostics = @import("errors.zig").Diagnostics; const cli = @import("cli.zig"); const preprocess = @import("preprocess.zig"); const renderErrorMessage = @import("utils.zig").renderErrorMessage; +const hasDisjointCodePage = @import("disjoint_code_page.zig").hasDisjointCodePage; const aro = @import("aro"); pub fn main() !void { @@ -179,16 +180,30 @@ pub fn main() 
!void { // Note: We still want to run this when no-preprocess is set because: // 1. We want to print accurate line numbers after removing multiline comments // 2. We want to be able to handle an already-preprocessed input with #line commands in it - var mapping_results = try parseAndRemoveLineCommands(allocator, full_input, full_input, .{ .initial_filename = options.input_filename }); - defer mapping_results.mappings.deinit(allocator); - - const final_input = removeComments(mapping_results.result, mapping_results.result, &mapping_results.mappings) catch |err| switch (err) { - error.InvalidSourceMappingCollapse => { - try error_handler.emitMessage(allocator, .err, "failed during comment removal; this is a known bug", .{}); + var mapping_results = parseAndRemoveLineCommands(allocator, full_input, full_input, .{ .initial_filename = options.input_filename }) catch |err| switch (err) { + error.InvalidLineCommand => { + // TODO: Maybe output the invalid line command + try renderErrorMessage(stderr.writer(), stderr_config, .err, "invalid line command in the preprocessed source", .{}); + if (options.preprocess == .no) { + try renderErrorMessage(stderr.writer(), stderr_config, .note, "line commands must be of the format: #line \"\"", .{}); + } else { + try renderErrorMessage(stderr.writer(), stderr_config, .note, "this is likely to be a bug, please report it", .{}); + } std.process.exit(1); }, - else => |e| return e, + error.LineNumberOverflow => { + // TODO: Better error message + try renderErrorMessage(stderr.writer(), stderr_config, .err, "line number count exceeded maximum of {}", .{std.math.maxInt(usize)}); + std.process.exit(1); + }, + error.OutOfMemory => |e| return e, }; + defer mapping_results.mappings.deinit(allocator); + + const default_code_page = options.default_code_page orelse .windows1252; + const has_disjoint_code_page = hasDisjointCodePage(mapping_results.result, &mapping_results.mappings, default_code_page); + + const final_input = try 
removeComments(mapping_results.result, mapping_results.result, &mapping_results.mappings); var output_file = std.fs.cwd().createFile(options.output_filename, .{}) catch |err| { try error_handler.emitMessage(allocator, .err, "unable to create output file '{s}': {s}", .{ options.output_filename, @errorName(err) }); @@ -211,7 +226,8 @@ pub fn main() !void { .extra_include_paths = options.extra_include_paths.items, .system_include_paths = include_paths, .default_language_id = options.default_language_id, - .default_code_page = options.default_code_page orelse .windows1252, + .default_code_page = default_code_page, + .disjoint_code_page = has_disjoint_code_page, .verbose = options.verbose, .null_terminate_string_table_strings = options.null_terminate_string_table_strings, .max_string_literal_codepoints = options.max_string_literal_codepoints, @@ -513,7 +529,7 @@ fn diagnosticsToErrorBundle( }; if (err_details.print_source_line) { const source_line = err_details.token.getLineForErrorDisplay(source, source_line_start); - const visual_info = err_details.visualTokenInfo(source_line_start, source_line_start + source_line.len); + const visual_info = err_details.visualTokenInfo(source_line_start, source_line_start + source_line.len, source); src_loc.span_start = @intCast(visual_info.point_offset - visual_info.before_len); src_loc.span_main = @intCast(visual_info.point_offset); src_loc.span_end = @intCast(visual_info.point_offset + 1 + visual_info.after_len); diff --git a/lib/compiler/resinator/parse.zig b/lib/compiler/resinator/parse.zig index 3bfd7fd7e257..6b9ba9368724 100644 --- a/lib/compiler/resinator/parse.zig +++ b/lib/compiler/resinator/parse.zig @@ -4,9 +4,10 @@ const Token = @import("lex.zig").Token; const Node = @import("ast.zig").Node; const Tree = @import("ast.zig").Tree; const CodePageLookup = @import("ast.zig").CodePageLookup; -const Resource = @import("rc.zig").Resource; +const ResourceType = @import("rc.zig").ResourceType; const Allocator = std.mem.Allocator; 
const ErrorDetails = @import("errors.zig").ErrorDetails; +const ErrorDetailsWithoutCodePage = @import("errors.zig").ErrorDetailsWithoutCodePage; const Diagnostics = @import("errors.zig").Diagnostics; const SourceBytes = @import("literals.zig").SourceBytes; const Compiler = @import("compile.zig").Compiler; @@ -30,6 +31,7 @@ pub const Parser = struct { pub const Options = struct { warn_instead_of_error_on_invalid_code_page: bool = false, + disjoint_code_page: bool = false, }; pub fn init(lexer: *Lexer, options: Options) Parser { @@ -47,6 +49,7 @@ pub const Parser = struct { diagnostics: *Diagnostics, input_code_page_lookup: CodePageLookup, output_code_page_lookup: CodePageLookup, + warned_about_disjoint_code_page: bool, }; pub fn parse(self: *Self, allocator: Allocator, diagnostics: *Diagnostics) Error!*Tree { @@ -61,6 +64,7 @@ pub const Parser = struct { .diagnostics = diagnostics, .input_code_page_lookup = CodePageLookup.init(arena.allocator(), self.lexer.default_code_page), .output_code_page_lookup = CodePageLookup.init(arena.allocator(), self.lexer.default_code_page), + .warned_about_disjoint_code_page = false, }; const parsed_root = try self.parseRoot(); @@ -116,7 +120,7 @@ pub const Parser = struct { const maybe_common_resource_attribute = try self.lookaheadToken(.normal); if (maybe_common_resource_attribute.id == .literal and rc.CommonResourceAttributes.map.has(maybe_common_resource_attribute.slice(self.lexer.buffer))) { try common_resource_attributes.append(self.state.arena, maybe_common_resource_attribute); - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); } else { break; } @@ -130,8 +134,13 @@ pub const Parser = struct { /// optional statements (if any). If there are no optional statements, the /// current token is unchanged. 
/// The returned slice is allocated by the parser's arena - fn parseOptionalStatements(self: *Self, resource: Resource) ![]*Node { + fn parseOptionalStatements(self: *Self, resource: ResourceType) ![]*Node { var optional_statements: std.ArrayListUnmanaged(*Node) = .empty; + + const num_statement_types = @typeInfo(rc.OptionalStatements).@"enum".fields.len; + var statement_type_has_duplicates = [_]bool{false} ** num_statement_types; + var last_statement_per_type = [_]?*Node{null} ** num_statement_types; + while (true) { const lookahead_token = try self.lookaheadToken(.normal); if (lookahead_token.id != .literal) break; @@ -140,7 +149,13 @@ pub const Parser = struct { .dialog, .dialogex => rc.OptionalStatements.dialog_map.get(slice) orelse break, else => break, }; - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); + + const type_i = @intFromEnum(optional_statement_type); + if (last_statement_per_type[type_i] != null) { + statement_type_has_duplicates[type_i] = true; + } + switch (optional_statement_type) { .language => { const language = try self.parseLanguageStatement(); @@ -166,7 +181,7 @@ pub const Parser = struct { try self.nextToken(.normal); const value = self.state.token; if (!value.isStringLiteral()) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_something_else, .token = value, .extra = .{ .expected_types = .{ @@ -223,7 +238,7 @@ pub const Parser = struct { try self.nextToken(.normal); const typeface = self.state.token; if (!typeface.isStringLiteral()) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_something_else, .token = typeface, .extra = .{ .expected_types = .{ @@ -272,7 +287,42 @@ pub const Parser = struct { try optional_statements.append(self.state.arena, &node.base); }, } + + last_statement_per_type[type_i] = optional_statements.items[optional_statements.items.len - 1]; } + + for 
(optional_statements.items) |optional_statement| { + const type_i = type_i: { + switch (optional_statement.id) { + .simple_statement => { + const simple_statement: *Node.SimpleStatement = @alignCast(@fieldParentPtr("base", optional_statement)); + const statement_identifier = simple_statement.identifier; + const slice = statement_identifier.slice(self.lexer.buffer); + const optional_statement_type = rc.OptionalStatements.map.get(slice) orelse + rc.OptionalStatements.dialog_map.get(slice).?; + break :type_i @intFromEnum(optional_statement_type); + }, + .font_statement => { + break :type_i @intFromEnum(rc.OptionalStatements.font); + }, + .language_statement => { + break :type_i @intFromEnum(rc.OptionalStatements.language); + }, + else => unreachable, + } + }; + if (!statement_type_has_duplicates[type_i]) continue; + if (optional_statement == last_statement_per_type[type_i].?) continue; + + try self.addErrorDetails(.{ + .err = .duplicate_optional_statement_skipped, + .type = .warning, + .token = optional_statement.getFirstToken(), + .token_span_start = optional_statement.getFirstToken(), + .token_span_end = optional_statement.getLastToken(), + }); + } + return optional_statements.toOwnedSlice(self.state.arena); } @@ -311,12 +361,13 @@ pub const Parser = struct { const maybe_end_token = try self.lookaheadToken(.normal); switch (maybe_end_token.id) { .end => { - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); break; }, .eof => { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsWithCodePageAndFail(.{ .err = .unfinished_string_table_block, + .code_page = self.lexer.current_code_page, .token = maybe_end_token, }); }, @@ -328,7 +379,7 @@ pub const Parser = struct { try self.nextToken(.normal); if (self.state.token.id != .quoted_ascii_string and self.state.token.id != .quoted_wide_string) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_something_else, .token 
= self.state.token, .extra = .{ .expected_types = .{ .string_literal = true } }, @@ -345,7 +396,7 @@ pub const Parser = struct { } if (strings.items.len == 0) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_token, // TODO: probably a more specific error message .token = self.state.token, .extra = .{ .expected = .number }, @@ -374,7 +425,12 @@ pub const Parser = struct { // of projects. So, we have special compatibility for this particular case. const maybe_eof = try self.lookaheadToken(.whitespace_delimiter_only); if (maybe_eof.id == .eof) { - // TODO: emit warning + try self.addErrorDetails(.{ + .err = .dangling_literal_at_eof, + .type = .warning, + .token = first_token, + }); + var context = try self.state.arena.alloc(Token, 2); context[0] = first_token; context[1] = maybe_eof; @@ -413,12 +469,12 @@ pub const Parser = struct { if (maybe_ordinal == null) { const would_be_win32_rc_ordinal = res.NameOrOrdinal.maybeNonAsciiOrdinalFromString(id_bytes); if (would_be_win32_rc_ordinal) |win32_rc_ordinal| { - try self.addErrorDetails(ErrorDetails{ + try self.addErrorDetails(.{ .err = .id_must_be_ordinal, .token = id_token, .extra = .{ .resource = resource }, }); - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .win32_non_ascii_ordinal, .token = id_token, .type = .note, @@ -426,7 +482,7 @@ pub const Parser = struct { .extra = .{ .number = win32_rc_ordinal.ordinal }, }); } else { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .id_must_be_ordinal, .token = id_token, .extra = .{ .resource = resource }, @@ -451,7 +507,7 @@ pub const Parser = struct { const lookahead = try self.lookaheadToken(.normal); switch (lookahead.id) { .end, .eof => { - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); break; }, else => {}, @@ -739,19 +795,19 @@ pub const Parser = struct { const maybe_begin = try 
self.lookaheadToken(.normal); if (maybe_begin.id == .begin) { - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); if (!resource.canUseRawData()) { - try self.addErrorDetails(ErrorDetails{ + try self.addErrorDetails(.{ .err = .resource_type_cant_use_raw_data, - .token = maybe_begin, + .token = self.state.token, .extra = .{ .resource = resource }, }); - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .resource_type_cant_use_raw_data, .type = .note, .print_source_line = false, - .token = maybe_begin, + .token = self.state.token, }); } @@ -802,11 +858,12 @@ pub const Parser = struct { const maybe_end_token = try self.lookaheadToken(.normal); switch (maybe_end_token.id) { .comma => { + try self.nextToken(.normal); // comma as the first token in a raw data block is an error if (raw_data.items.len == 0) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_something_else, - .token = maybe_end_token, + .token = self.state.token, .extra = .{ .expected_types = .{ .number = true, .number_expression = true, @@ -815,16 +872,16 @@ pub const Parser = struct { }); } // otherwise just skip over commas - self.nextToken(.normal) catch unreachable; continue; }, .end => { - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); break; }, .eof => { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsWithCodePageAndFail(.{ .err = .unfinished_raw_data_block, + .code_page = self.lexer.current_code_page, .token = maybe_end_token, }); }, @@ -836,10 +893,12 @@ pub const Parser = struct { if (expression.isNumberExpression()) { const maybe_close_paren = try self.lookaheadToken(.normal); if (maybe_close_paren.id == .close_paren) { + // advance to ensure that the code page lookup is populated for this token + try self.nextToken(.normal); // ) is an error - return self.addErrorDetailsAndFail(ErrorDetails{ + return 
self.addErrorDetailsAndFail(.{ .err = .expected_token, - .token = maybe_close_paren, + .token = self.state.token, .extra = .{ .expected = .operator }, }); } @@ -852,10 +911,10 @@ pub const Parser = struct { /// begin on the next token. /// After return, the current token will be the token immediately before the end of the /// control statement (or unchanged if the function returns null). - fn parseControlStatement(self: *Self, resource: Resource) Error!?*Node { + fn parseControlStatement(self: *Self, resource: ResourceType) Error!?*Node { const control_token = try self.lookaheadToken(.normal); const control = rc.Control.map.get(control_token.slice(self.lexer.buffer)) orelse return null; - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); try self.skipAnyCommas(); @@ -867,7 +926,7 @@ pub const Parser = struct { text = self.state.token; }, else => { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_something_else, .token = self.state.token, .extra = .{ .expected_types = .{ @@ -920,14 +979,16 @@ pub const Parser = struct { // the style parameter. 
const lookahead_token = try self.lookaheadToken(.normal); if (lookahead_token.id != .comma and lookahead_token.id != .eof) { - try self.addErrorDetails(.{ + try self.addErrorDetailsWithCodePage(.{ .err = .rc_could_miscompile_control_params, .type = .warning, + .code_page = self.lexer.current_code_page, .token = lookahead_token, }); - try self.addErrorDetails(.{ + try self.addErrorDetailsWithCodePage(.{ .err = .rc_could_miscompile_control_params, .type = .note, + .code_page = self.lexer.current_code_page, .token = style.?.getFirstToken(), .token_span_end = style.?.getLastToken(), }); @@ -987,7 +1048,7 @@ pub const Parser = struct { fn parseToolbarButtonStatement(self: *Self) Error!?*Node { const keyword_token = try self.lookaheadToken(.normal); const button_type = rc.ToolbarButton.map.get(keyword_token.slice(self.lexer.buffer)) orelse return null; - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); switch (button_type) { .separator => { @@ -1014,10 +1075,10 @@ pub const Parser = struct { /// begin on the next token. /// After return, the current token will be the token immediately before the end of the /// menuitem statement (or unchanged if the function returns null). 
- fn parseMenuItemStatement(self: *Self, resource: Resource, top_level_menu_id_token: Token, nesting_level: u32) Error!?*Node { + fn parseMenuItemStatement(self: *Self, resource: ResourceType, top_level_menu_id_token: Token, nesting_level: u32) Error!?*Node { const menuitem_token = try self.lookaheadToken(.normal); const menuitem = rc.MenuItem.map.get(menuitem_token.slice(self.lexer.buffer)) orelse return null; - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); if (nesting_level > max_nested_menu_level) { try self.addErrorDetails(.{ @@ -1050,7 +1111,7 @@ pub const Parser = struct { } else { const text = self.state.token; if (!text.isStringLiteral()) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_something_else, .token = text, .extra = .{ .expected_types = .{ @@ -1070,7 +1131,7 @@ pub const Parser = struct { if (!rc.MenuItem.Option.map.has(option_token.slice(self.lexer.buffer))) { break; } - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); try options.append(self.state.arena, option_token); try self.skipAnyCommas(); } @@ -1089,7 +1150,7 @@ pub const Parser = struct { try self.nextToken(.normal); const text = self.state.token; if (!text.isStringLiteral()) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_something_else, .token = text, .extra = .{ .expected_types = .{ @@ -1105,7 +1166,7 @@ pub const Parser = struct { if (!rc.MenuItem.Option.map.has(option_token.slice(self.lexer.buffer))) { break; } - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); try options.append(self.state.arena, option_token); try self.skipAnyCommas(); } @@ -1146,7 +1207,7 @@ pub const Parser = struct { try self.nextToken(.normal); const text = self.state.token; if (!text.isStringLiteral()) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = 
.expected_something_else, .token = text, .extra = .{ .expected_types = .{ @@ -1257,7 +1318,7 @@ pub const Parser = struct { fn parseVersionStatement(self: *Self) Error!?*Node { const type_token = try self.lookaheadToken(.normal); const statement_type = rc.VersionInfo.map.get(type_token.slice(self.lexer.buffer)) orelse return null; - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); switch (statement_type) { .file_version, .product_version => { var parts_buffer: [4]*Node = undefined; @@ -1301,7 +1362,7 @@ pub const Parser = struct { fn parseVersionBlockOrValue(self: *Self, top_level_version_id_token: Token, nesting_level: u32) Error!?*Node { const keyword_token = try self.lookaheadToken(.normal); const keyword = rc.VersionBlock.map.get(keyword_token.slice(self.lexer.buffer)) orelse return null; - self.nextToken(.normal) catch unreachable; + try self.nextToken(.normal); if (nesting_level > max_nested_version_level) { try self.addErrorDetails(.{ @@ -1541,7 +1602,7 @@ pub const Parser = struct { } }; - pub fn toErrorDetails(options: ParseExpressionOptions, token: Token) ErrorDetails { + pub fn toErrorDetails(options: ParseExpressionOptions, token: Token) ErrorDetailsWithoutCodePage { // TODO: expected_types_override interaction with is_known_to_be_number_expression? 
const expected_types = options.expected_types_override orelse ErrorDetails.ExpectedTypes{ .number = options.allowed_types.number, @@ -1549,7 +1610,7 @@ pub const Parser = struct { .string_literal = options.allowed_types.string and !options.is_known_to_be_number_expression, .literal = options.allowed_types.literal and !options.is_known_to_be_number_expression, }; - return ErrorDetails{ + return .{ .err = .expected_something_else, .token = token, .extra = .{ .expected_types = expected_types }, @@ -1690,7 +1751,7 @@ pub const Parser = struct { try self.addErrorDetails(options.toErrorDetails(self.state.token)); if (is_close_paren_expression) { - try self.addErrorDetails(ErrorDetails{ + try self.addErrorDetails(.{ .err = .close_paren_expression, .type = .note, .token = self.state.token, @@ -1698,7 +1759,7 @@ pub const Parser = struct { }); } if (is_unary_plus_expression) { - try self.addErrorDetails(ErrorDetails{ + try self.addErrorDetails(.{ .err = .unary_plus_expression, .type = .note, .token = self.state.token, @@ -1739,7 +1800,7 @@ pub const Parser = struct { }); if (!rhs_node.isNumberExpression()) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_something_else, .token = rhs_node.getFirstToken(), .token_span_end = rhs_node.getLastToken(), @@ -1781,16 +1842,39 @@ pub const Parser = struct { fn parseOptionalTokenAdvanced(self: *Self, id: Token.Id, comptime method: Lexer.LexMethod) Error!bool { const maybe_token = try self.lookaheadToken(method); if (maybe_token.id != id) return false; - self.nextToken(method) catch unreachable; + try self.nextToken(method); return true; } - fn addErrorDetails(self: *Self, details: ErrorDetails) Allocator.Error!void { + fn addErrorDetailsWithCodePage(self: *Self, details: ErrorDetails) Allocator.Error!void { try self.state.diagnostics.append(details); } - fn addErrorDetailsAndFail(self: *Self, details: ErrorDetails) Error { - try self.addErrorDetails(details); + fn 
addErrorDetailsWithCodePageAndFail(self: *Self, details: ErrorDetails) Error { + try self.addErrorDetailsWithCodePage(details); + return error.ParseError; + } + + /// Code page is looked up in input_code_page_lookup using the token, meaning the token + /// must come from nextToken (i.e. it can't be a lookahead token). + fn addErrorDetails(self: *Self, details_without_code_page: ErrorDetailsWithoutCodePage) Allocator.Error!void { + const details = ErrorDetails{ + .err = details_without_code_page.err, + .code_page = self.state.input_code_page_lookup.getForToken(details_without_code_page.token), + .token = details_without_code_page.token, + .token_span_start = details_without_code_page.token_span_start, + .token_span_end = details_without_code_page.token_span_end, + .type = details_without_code_page.type, + .print_source_line = details_without_code_page.print_source_line, + .extra = details_without_code_page.extra, + }; + try self.addErrorDetailsWithCodePage(details); + } + + /// Code page is looked up in input_code_page_lookup using the token, meaning the token + /// must come from nextToken (i.e. it can't be a lookahead token). 
+ fn addErrorDetailsAndFail(self: *Self, details_without_code_page: ErrorDetailsWithoutCodePage) Error { + try self.addErrorDetails(details_without_code_page); return error.ParseError; } @@ -1798,35 +1882,34 @@ pub const Parser = struct { self.state.token = token: while (true) { const token = self.lexer.next(method) catch |err| switch (err) { error.CodePagePragmaInIncludedFile => { - // The Win32 RC compiler silently ignores such `#pragma code_point` directives, + // The Win32 RC compiler silently ignores such `#pragma code_page` directives, // but we want to both ignore them *and* emit a warning - try self.addErrorDetails(.{ - .err = .code_page_pragma_in_included_file, - .type = .warning, - .token = self.lexer.error_context_token.?, - }); + var details = self.lexer.getErrorDetails(err); + details.type = .warning; + try self.addErrorDetailsWithCodePage(details); continue; }, error.CodePagePragmaInvalidCodePage => { var details = self.lexer.getErrorDetails(err); if (!self.options.warn_instead_of_error_on_invalid_code_page) { - return self.addErrorDetailsAndFail(details); + return self.addErrorDetailsWithCodePageAndFail(details); } details.type = .warning; - try self.addErrorDetails(details); + try self.addErrorDetailsWithCodePage(details); continue; }, error.InvalidDigitCharacterInNumberLiteral => { const details = self.lexer.getErrorDetails(err); - try self.addErrorDetails(details); - return self.addErrorDetailsAndFail(.{ + try self.addErrorDetailsWithCodePage(details); + return self.addErrorDetailsWithCodePageAndFail(.{ .err = details.err, .type = .note, + .code_page = self.lexer.current_code_page, .token = details.token, .print_source_line = false, }); }, - else => return self.addErrorDetailsAndFail(self.lexer.getErrorDetails(err)), + else => return self.addErrorDetailsWithCodePageAndFail(self.lexer.getErrorDetails(err)), }; break :token token; }; @@ -1835,7 +1918,29 @@ pub const Parser = struct { // But only set the output code page to the current code page if 
we are past the first code_page pragma in the file. // Otherwise, we want to fill the lookup using the default code page so that lookups still work for lines that // don't have an explicit output code page set. - const output_code_page = if (self.lexer.seen_pragma_code_pages > 1) self.lexer.current_code_page else self.state.output_code_page_lookup.default_code_page; + const is_disjoint_code_page = self.options.disjoint_code_page and self.lexer.seen_pragma_code_pages == 1; + const output_code_page = if (is_disjoint_code_page) + self.state.output_code_page_lookup.default_code_page + else + self.lexer.current_code_page; + + if (is_disjoint_code_page and !self.state.warned_about_disjoint_code_page) { + try self.addErrorDetailsWithCodePage(.{ + .err = .disjoint_code_page, + .type = .warning, + .code_page = self.state.input_code_page_lookup.getForLineNum(self.lexer.last_pragma_code_page_token.?.line_number), + .token = self.lexer.last_pragma_code_page_token.?, + }); + try self.addErrorDetailsWithCodePage(.{ + .err = .disjoint_code_page, + .type = .note, + .code_page = self.state.input_code_page_lookup.getForLineNum(self.lexer.last_pragma_code_page_token.?.line_number), + .token = self.lexer.last_pragma_code_page_token.?, + .print_source_line = false, + }); + self.state.warned_about_disjoint_code_page = true; + } + try self.state.output_code_page_lookup.setForToken(self.state.token, output_code_page); } @@ -1846,7 +1951,7 @@ pub const Parser = struct { // Ignore this error and get the next valid token, we'll deal with this // properly when getting the token for real error.CodePagePragmaInIncludedFile => continue, - else => return self.addErrorDetailsAndFail(self.state.lookahead_lexer.getErrorDetails(err)), + else => return self.addErrorDetailsWithCodePageAndFail(self.state.lookahead_lexer.getErrorDetails(err)), }; }; } @@ -1860,7 +1965,7 @@ pub const Parser = struct { switch (self.state.token.id) { .literal => {}, else => { - return 
self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_token, .token = self.state.token, .extra = .{ .expected = .literal }, @@ -1871,7 +1976,7 @@ pub const Parser = struct { fn check(self: *Self, expected_token_id: Token.Id) !void { if (self.state.token.id != expected_token_id) { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_token, .token = self.state.token, .extra = .{ .expected = expected_token_id }, @@ -1879,14 +1984,14 @@ pub const Parser = struct { } } - fn checkResource(self: *Self) !Resource { + fn checkResource(self: *Self) !ResourceType { switch (self.state.token.id) { - .literal => return Resource.fromString(.{ + .literal => return ResourceType.fromString(.{ .slice = self.state.token.slice(self.lexer.buffer), .code_page = self.lexer.current_code_page, }), else => { - return self.addErrorDetailsAndFail(ErrorDetails{ + return self.addErrorDetailsAndFail(.{ .err = .expected_token, .token = self.state.token, .extra = .{ .expected = .literal }, diff --git a/lib/compiler/resinator/preprocess.zig b/lib/compiler/resinator/preprocess.zig index c5b4b1edea54..e548683ef1bd 100644 --- a/lib/compiler/resinator/preprocess.zig +++ b/lib/compiler/resinator/preprocess.zig @@ -96,6 +96,7 @@ pub fn appendAroArgs(arena: Allocator, argv: *std.ArrayList([]const u8), options "--emulate=msvc", "-nostdinc", "-DRC_INVOKED", + "-D_WIN32", // undocumented, but defined by default }); for (options.extra_include_paths.items) |extra_include_path| { try argv.append("-I"); diff --git a/lib/compiler/resinator/rc.zig b/lib/compiler/resinator/rc.zig index a434e26c80ec..753decb1a7e1 100644 --- a/lib/compiler/resinator/rc.zig +++ b/lib/compiler/resinator/rc.zig @@ -5,7 +5,7 @@ const SourceBytes = @import("literals.zig").SourceBytes; // https://learn.microsoft.com/en-us/windows/win32/menurc/about-resource-files -pub const Resource = enum { +pub const ResourceType = enum { accelerators, 
bitmap, cursor, @@ -48,7 +48,7 @@ pub const Resource = enum { manifest_num, const map = std.StaticStringMapWithEql( - Resource, + ResourceType, std.static_string_map.eqlAsciiIgnoreCase, ).initComptime(.{ .{ "ACCELERATORS", .accelerators }, @@ -72,7 +72,7 @@ pub const Resource = enum { .{ "VXD", .vxd }, }); - pub fn fromString(bytes: SourceBytes) Resource { + pub fn fromString(bytes: SourceBytes) ResourceType { const maybe_ordinal = res.NameOrOrdinal.maybeOrdinalFromString(bytes); if (maybe_ordinal) |ordinal| { if (ordinal.ordinal >= 256) return .user_defined; @@ -81,8 +81,8 @@ pub const Resource = enum { return map.get(bytes.slice) orelse .user_defined; } - // TODO: Some comptime validation that RT <-> Resource conversion is synced? - pub fn fromRT(rt: res.RT) Resource { + // TODO: Some comptime validation that RT <-> ResourceType conversion is synced? + pub fn fromRT(rt: res.RT) ResourceType { return switch (rt) { .ACCELERATOR => .accelerators, .ANICURSOR => .anicursor_num, @@ -111,7 +111,7 @@ pub const Resource = enum { }; } - pub fn canUseRawData(resource: Resource) bool { + pub fn canUseRawData(resource: ResourceType) bool { return switch (resource) { .user_defined, .html, @@ -125,7 +125,7 @@ pub const Resource = enum { }; } - pub fn nameForErrorDisplay(resource: Resource) []const u8 { + pub fn nameForErrorDisplay(resource: ResourceType) []const u8 { return switch (resource) { // zig fmt: off .accelerators, .bitmap, .cursor, .dialog, .dialogex, .dlginclude, .dlginit, .font, diff --git a/lib/compiler/resinator/res.zig b/lib/compiler/resinator/res.zig index 991e0b8fb8a1..37475b308180 100644 --- a/lib/compiler/resinator/res.zig +++ b/lib/compiler/resinator/res.zig @@ -1,10 +1,10 @@ const std = @import("std"); const rc = @import("rc.zig"); -const Resource = rc.Resource; +const ResourceType = rc.ResourceType; const CommonResourceAttributes = rc.CommonResourceAttributes; const Allocator = std.mem.Allocator; const windows1252 = @import("windows1252.zig"); -const 
CodePage = @import("code_pages.zig").CodePage; +const SupportedCodePage = @import("code_pages.zig").SupportedCodePage; const literals = @import("literals.zig"); const SourceBytes = literals.SourceBytes; const Codepoint = @import("code_pages.zig").Codepoint; @@ -40,7 +40,7 @@ pub const RT = enum(u8) { /// Returns null if the resource type is user-defined /// Asserts that the resource is not `stringtable` - pub fn fromResource(resource: Resource) ?RT { + pub fn fromResource(resource: ResourceType) ?RT { return switch (resource) { .accelerators => .ACCELERATOR, .bitmap => .BITMAP, @@ -162,6 +162,27 @@ pub const Language = packed struct(u16) { pub fn asInt(self: Language) u16 { return @bitCast(self); } + + pub fn format( + language: Language, + comptime fmt: []const u8, + options: std.fmt.FormatOptions, + out_stream: anytype, + ) !void { + _ = fmt; + _ = options; + const language_id = language.asInt(); + const language_name = language_name: { + if (std.meta.intToEnum(lang.LanguageId, language_id)) |lang_enum_val| { + break :language_name @tagName(lang_enum_val); + } else |_| {} + if (language_id == lang.LOCALE_CUSTOM_UNSPECIFIED) { + break :language_name "LOCALE_CUSTOM_UNSPECIFIED"; + } + break :language_name ""; + }; + try out_stream.print("{s} (0x{X})", .{ language_name, language_id }); + } }; /// https://learn.microsoft.com/en-us/windows/win32/api/winuser/ns-winuser-dlgitemtemplate#remarks @@ -423,6 +444,50 @@ pub const NameOrOrdinal = union(enum) { .name => return null, } } + + pub fn format( + self: NameOrOrdinal, + comptime fmt: []const u8, + options: std.fmt.FormatOptions, + out_stream: anytype, + ) !void { + _ = fmt; + _ = options; + switch (self) { + .name => |name| { + try out_stream.print("{s}", .{std.unicode.fmtUtf16Le(name)}); + }, + .ordinal => |ordinal| { + try out_stream.print("{d}", .{ordinal}); + }, + } + } + + fn formatResourceType( + self: NameOrOrdinal, + comptime fmt: []const u8, + options: std.fmt.FormatOptions, + out_stream: anytype, + ) !void { 
+ _ = fmt; + _ = options; + switch (self) { + .name => |name| { + try out_stream.print("{s}", .{std.unicode.fmtUtf16Le(name)}); + }, + .ordinal => |ordinal| { + if (std.enums.tagName(RT, @enumFromInt(ordinal))) |predefined_type_name| { + try out_stream.print("{s}", .{predefined_type_name}); + } else { + try out_stream.print("{d}", .{ordinal}); + } + }, + } + } + + pub fn fmtResourceType(type_value: NameOrOrdinal) std.fmt.Formatter(formatResourceType) { + return .{ .data = type_value }; + } }; fn expectNameOrOrdinal(expected: NameOrOrdinal, actual: NameOrOrdinal) !void { @@ -603,12 +668,33 @@ pub const AcceleratorModifiers = struct { const AcceleratorKeyCodepointTranslator = struct { string_type: literals.StringType, + output_code_page: SupportedCodePage, pub fn translate(self: @This(), maybe_parsed: ?literals.IterativeStringParser.ParsedCodepoint) ?u21 { const parsed = maybe_parsed orelse return null; if (parsed.codepoint == Codepoint.invalid) return 0xFFFD; - if (parsed.from_escaped_integer and self.string_type == .ascii) { - return windows1252.toCodepoint(@truncate(parsed.codepoint)); + if (parsed.from_escaped_integer) { + switch (self.string_type) { + .ascii => { + const truncated: u8 = @truncate(parsed.codepoint); + switch (self.output_code_page) { + .utf8 => switch (truncated) { + 0...0x7F => return truncated, + else => return 0xFFFD, + }, + .windows1252 => return windows1252.toCodepoint(truncated), + } + }, + .wide => { + const truncated: u16 = @truncate(parsed.codepoint); + return truncated; + }, + } + } + if (parsed.escaped_surrogate_pair) { + // The codepoint of only the low surrogate + const low = @as(u16, @intCast(parsed.codepoint & 0x3FF)) + 0xDC00; + return low; } return parsed.codepoint; } @@ -623,14 +709,17 @@ pub fn parseAcceleratorKeyString(bytes: SourceBytes, is_virt: bool, options: lit } var parser = literals.IterativeStringParser.init(bytes, options); - var translator = AcceleratorKeyCodepointTranslator{ .string_type = 
parser.declared_string_type }; + var translator = AcceleratorKeyCodepointTranslator{ + .string_type = parser.declared_string_type, + .output_code_page = options.output_code_page, + }; const first_codepoint = translator.translate(try parser.next()) orelse return error.EmptyAccelerator; // 0 is treated as a terminator, so this is equivalent to an empty string if (first_codepoint == 0) return error.EmptyAccelerator; if (first_codepoint == '^') { - // Note: Emitting this warning unconditonally whenever ^ is the first character + // Note: Emitting this warning unconditionally whenever ^ is the first character // matches the Win32 RC behavior, but it's questionable whether or not // the warning should be emitted for ^^ since that results in the ASCII // character ^ being written to the .res. @@ -638,11 +727,18 @@ pub fn parseAcceleratorKeyString(bytes: SourceBytes, is_virt: bool, options: lit try options.diagnostics.?.diagnostics.append(.{ .err = .ascii_character_not_equivalent_to_virtual_key_code, .type = .warning, + .code_page = bytes.code_page, .token = options.diagnostics.?.token, }); } const c = translator.translate(try parser.next()) orelse return error.InvalidControlCharacter; + + const third_codepoint = translator.translate(try parser.next()); + // 0 is treated as a terminator, so a 0 in the third position is fine but + // anything else is too many codepoints for an accelerator + if (third_codepoint != null and third_codepoint.? 
!= 0) return error.InvalidControlCharacter; + switch (c) { '^' => return '^', // special case 'a'...'z', 'A'...'Z' => return std.ascii.toUpper(@intCast(c)) - 0x40, @@ -699,44 +795,44 @@ test "accelerator keys" { try std.testing.expectEqual(@as(u16, 1), try parseAcceleratorKeyString( .{ .slice = "\"^a\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 1), try parseAcceleratorKeyString( .{ .slice = "\"^A\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 26), try parseAcceleratorKeyString( .{ .slice = "\"^Z\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, '^'), try parseAcceleratorKeyString( .{ .slice = "\"^^\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 'a'), try parseAcceleratorKeyString( .{ .slice = "\"a\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 0x6162), try parseAcceleratorKeyString( .{ .slice = "\"ab\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 'C'), try parseAcceleratorKeyString( .{ .slice = "\"c\"", .code_page = .windows1252 }, true, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 0x6363), try parseAcceleratorKeyString( .{ .slice = "\"cc\"", .code_page = .windows1252 }, true, - .{}, + .{ .output_code_page = .windows1252 }, )); // \x00 or any escape that evaluates to zero acts as a terminator, everything past it @@ -744,93 +840,93 @@ test "accelerator keys" { try std.testing.expectEqual(@as(u16, 'a'), try parseAcceleratorKeyString( .{ .slice = "\"a\\0bcdef\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 
}, )); // \x80 is € in Windows-1252, which is Unicode codepoint 20AC try std.testing.expectEqual(@as(u16, 0x20AC), try parseAcceleratorKeyString( .{ .slice = "\"\x80\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // This depends on the code page, though, with codepage 65001, \x80 // on its own is invalid UTF-8 so it gets converted to the replacement character try std.testing.expectEqual(@as(u16, 0xFFFD), try parseAcceleratorKeyString( .{ .slice = "\"\x80\"", .code_page = .utf8 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 0xCCAC), try parseAcceleratorKeyString( .{ .slice = "\"\x80\x80\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // This also behaves the same with escaped characters try std.testing.expectEqual(@as(u16, 0x20AC), try parseAcceleratorKeyString( .{ .slice = "\"\\x80\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // Even with utf8 code page try std.testing.expectEqual(@as(u16, 0x20AC), try parseAcceleratorKeyString( .{ .slice = "\"\\x80\"", .code_page = .utf8 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 0xCCAC), try parseAcceleratorKeyString( .{ .slice = "\"\\x80\\x80\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // Wide string with the actual characters behaves like the ASCII string version try std.testing.expectEqual(@as(u16, 0xCCAC), try parseAcceleratorKeyString( .{ .slice = "L\"\x80\x80\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // But wide string with escapes behaves differently try std.testing.expectEqual(@as(u16, 0x8080), try parseAcceleratorKeyString( .{ .slice = "L\"\\x80\\x80\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // and invalid escapes within wide strings 
get skipped try std.testing.expectEqual(@as(u16, 'z'), try parseAcceleratorKeyString( .{ .slice = "L\"\\Hz\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // any non-A-Z codepoints are illegal try std.testing.expectError(error.ControlCharacterOutOfRange, parseAcceleratorKeyString( .{ .slice = "\"^\x83\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectError(error.ControlCharacterOutOfRange, parseAcceleratorKeyString( .{ .slice = "\"^1\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectError(error.InvalidControlCharacter, parseAcceleratorKeyString( .{ .slice = "\"^\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectError(error.EmptyAccelerator, parseAcceleratorKeyString( .{ .slice = "\"\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectError(error.AcceleratorTooLong, parseAcceleratorKeyString( .{ .slice = "\"hello\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectError(error.ControlCharacterOutOfRange, parseAcceleratorKeyString( .{ .slice = "\"^\x80\"", .code_page = .windows1252 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // Invalid UTF-8 gets converted to 0xFFFD, multiple invalids get shifted and added together @@ -838,40 +934,81 @@ test "accelerator keys" { try std.testing.expectEqual(@as(u16, 0xFCFD), try parseAcceleratorKeyString( .{ .slice = "\"\x80\x80\"", .code_page = .utf8 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 0xFCFD), try parseAcceleratorKeyString( .{ .slice = "L\"\x80\x80\"", .code_page = .utf8 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // Codepoints >= 0x10000 try std.testing.expectEqual(@as(u16, 0xDD00), 
try parseAcceleratorKeyString( .{ .slice = "\"\xF0\x90\x84\x80\"", .code_page = .utf8 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 0xDD00), try parseAcceleratorKeyString( .{ .slice = "L\"\xF0\x90\x84\x80\"", .code_page = .utf8 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectEqual(@as(u16, 0x9C01), try parseAcceleratorKeyString( .{ .slice = "\"\xF4\x80\x80\x81\"", .code_page = .utf8 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); // anything before or after a codepoint >= 0x10000 causes an error try std.testing.expectError(error.AcceleratorTooLong, parseAcceleratorKeyString( .{ .slice = "\"a\xF0\x90\x80\x80\"", .code_page = .utf8 }, false, - .{}, + .{ .output_code_page = .windows1252 }, )); try std.testing.expectError(error.AcceleratorTooLong, parseAcceleratorKeyString( .{ .slice = "\"\xF0\x90\x80\x80a\"", .code_page = .utf8 }, false, - .{}, + .{ .output_code_page = .windows1252 }, + )); + + // Misc special cases + try std.testing.expectEqual(@as(u16, 0xFFFD), try parseAcceleratorKeyString( + .{ .slice = "\"\\777\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + try std.testing.expectEqual(@as(u16, 0xFFFF), try parseAcceleratorKeyString( + .{ .slice = "L\"\\7777777\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + try std.testing.expectEqual(@as(u16, 0x01), try parseAcceleratorKeyString( + .{ .slice = "L\"\\200001\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + // Escape of a codepoint >= 0x10000 omits the high surrogate pair + try std.testing.expectEqual(@as(u16, 0xDF48), try parseAcceleratorKeyString( + .{ .slice = "L\"\\𐍈\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + // Invalid escape code is skipped, allows for 2 codepoints afterwards + try std.testing.expectEqual(@as(u16, 0x7878), try parseAcceleratorKeyString( + .{ .slice = "L\"\\kxx\"", 
.code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + // Escape of a codepoint >= 0x10000 allows for a codepoint afterwards + try std.testing.expectEqual(@as(u16, 0x4878), try parseAcceleratorKeyString( + .{ .slice = "L\"\\𐍈x\"", .code_page = .utf8 }, + false, + .{ .output_code_page = .utf8 }, + )); + // Input code page of 1252, output code page of utf-8 + try std.testing.expectEqual(@as(u16, 0xFFFD), try parseAcceleratorKeyString( + .{ .slice = "\"\\270\"", .code_page = .windows1252 }, + false, + .{ .output_code_page = .utf8 }, )); } diff --git a/lib/compiler/resinator/source_mapping.zig b/lib/compiler/resinator/source_mapping.zig index ba396b019c89..c6ffc527ff01 100644 --- a/lib/compiler/resinator/source_mapping.zig +++ b/lib/compiler/resinator/source_mapping.zig @@ -38,7 +38,7 @@ pub const ParseAndRemoveLineCommandsOptions = struct { /// /// If `options.initial_filename` is provided, that filename is guaranteed to be /// within the `mappings.files` table and `root_filename_offset` will be set appropriately. -pub fn parseAndRemoveLineCommands(allocator: Allocator, source: []const u8, buf: []u8, options: ParseAndRemoveLineCommandsOptions) !ParseLineCommandsResult { +pub fn parseAndRemoveLineCommands(allocator: Allocator, source: []const u8, buf: []u8, options: ParseAndRemoveLineCommandsOptions) error{ OutOfMemory, InvalidLineCommand, LineNumberOverflow }!ParseLineCommandsResult { var parse_result = ParseLineCommandsResult{ .result = undefined, .mappings = .{}, @@ -53,12 +53,41 @@ pub fn parseAndRemoveLineCommands(allocator: Allocator, source: []const u8, buf: parse_result.mappings.root_filename_offset = try parse_result.mappings.files.put(allocator, initial_filename); } + // This implementation attempts to be comment and string aware in order + // to avoid errant #line "" within multiline comments + // leading to problems in the source mapping after comments are removed, + // but it is not a perfect implementation (intentionally). 
+ // + // The current implementation does not handle cases like + // /* foo */ #line ... + // #line ... // foo + // #line ... /* foo ... + // etc + // + // (the first example will not be recognized as a #line command, the second + // and third will error with InvalidLineCommand) + // + // This is fine, though, since #line commands are generated by the + // preprocessor so in normal circumstances they will be well-formed and + // consistent. The only realistic way the imperfect implementation could + // affect a 'real' use-case would be someone taking the output of a + // preprocessor, editing it manually to add comments before/after #line + // commands, and then running it through resinator with /:no-preprocess. + std.debug.assert(buf.len >= source.len); var result = UncheckedSliceWriter{ .slice = buf }; const State = enum { line_start, preprocessor, non_preprocessor, + forward_slash, + line_comment, + multiline_comment, + multiline_comment_end, + single_quoted, + single_quoted_escape, + double_quoted, + double_quoted_escape, }; var state: State = .line_start; var index: usize = 0; @@ -66,8 +95,8 @@ pub fn parseAndRemoveLineCommands(allocator: Allocator, source: []const u8, buf: var preprocessor_start: usize = 0; var line_number: usize = 1; while (index < source.len) : (index += 1) { - const c = source[index]; - switch (state) { + var c = source[index]; + state: switch (state) { .line_start => switch (c) { '#' => { preprocessor_start = index; @@ -93,6 +122,27 @@ pub fn parseAndRemoveLineCommands(allocator: Allocator, source: []const u8, buf: pending_start = index; } }, + '/' => { + if (!current_mapping.ignore_contents) { + result.writeSlice(source[pending_start orelse index .. index + 1]); + pending_start = null; + } + state = .forward_slash; + }, + '\'' => { + if (!current_mapping.ignore_contents) { + result.writeSlice(source[pending_start orelse index .. 
index + 1]); + pending_start = null; + } + state = .single_quoted; + }, + '"' => { + if (!current_mapping.ignore_contents) { + result.writeSlice(source[pending_start orelse index .. index + 1]); + pending_start = null; + } + state = .double_quoted; + }, else => { state = .non_preprocessor; if (pending_start != null) { @@ -107,25 +157,246 @@ pub fn parseAndRemoveLineCommands(allocator: Allocator, source: []const u8, buf: } }, }, + .forward_slash => switch (c) { + '\r', '\n' => { + const is_crlf = formsLineEndingPair(source, c, index + 1); + if (!current_mapping.ignore_contents) { + try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); + + result.write(c); + if (is_crlf) result.write(source[index + 1]); + line_number += 1; + } + if (is_crlf) index += 1; + state = .line_start; + pending_start = null; + }, + '/' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .line_comment; + }, + '*' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .multiline_comment; + }, + else => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .non_preprocessor; + }, + }, + .line_comment => switch (c) { + '\r', '\n' => { + const is_crlf = formsLineEndingPair(source, c, index + 1); + if (!current_mapping.ignore_contents) { + try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); + + result.write(c); + if (is_crlf) result.write(source[index + 1]); + line_number += 1; + } + if (is_crlf) index += 1; + state = .line_start; + pending_start = null; + }, + else => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + }, + }, + .multiline_comment => switch (c) { + '\r', '\n' => { + const is_crlf = formsLineEndingPair(source, c, index + 1); + if (!current_mapping.ignore_contents) { + try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); + + result.write(c); + if (is_crlf) result.write(source[index + 1]); + line_number += 1; + } + 
if (is_crlf) index += 1; + pending_start = null; + }, + '*' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .multiline_comment_end; + }, + else => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + }, + }, + .multiline_comment_end => switch (c) { + '\r', '\n' => { + const is_crlf = formsLineEndingPair(source, c, index + 1); + if (!current_mapping.ignore_contents) { + try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); + + result.write(c); + if (is_crlf) result.write(source[index + 1]); + line_number += 1; + } + if (is_crlf) index += 1; + state = .multiline_comment; + pending_start = null; + }, + '/' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .non_preprocessor; + }, + '*' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + // stay in multiline_comment_end state + }, + else => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .multiline_comment; + }, + }, + .single_quoted => switch (c) { + '\r', '\n' => { + const is_crlf = formsLineEndingPair(source, c, index + 1); + if (!current_mapping.ignore_contents) { + try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); + + result.write(c); + if (is_crlf) result.write(source[index + 1]); + line_number += 1; + } + if (is_crlf) index += 1; + state = .line_start; + pending_start = null; + }, + '\\' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .single_quoted_escape; + }, + '\'' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .non_preprocessor; + }, + else => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + }, + }, + .single_quoted_escape => switch (c) { + '\r', '\n' => { + const is_crlf = formsLineEndingPair(source, c, index + 1); + if (!current_mapping.ignore_contents) { + try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); + + 
result.write(c); + if (is_crlf) result.write(source[index + 1]); + line_number += 1; + } + if (is_crlf) index += 1; + state = .line_start; + pending_start = null; + }, + else => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .single_quoted; + }, + }, + .double_quoted => switch (c) { + '\r', '\n' => { + const is_crlf = formsLineEndingPair(source, c, index + 1); + if (!current_mapping.ignore_contents) { + try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); + + result.write(c); + if (is_crlf) result.write(source[index + 1]); + line_number += 1; + } + if (is_crlf) index += 1; + state = .line_start; + pending_start = null; + }, + '\\' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .double_quoted_escape; + }, + '"' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .non_preprocessor; + }, + else => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + }, + }, + .double_quoted_escape => switch (c) { + '\r', '\n' => { + const is_crlf = formsLineEndingPair(source, c, index + 1); + if (!current_mapping.ignore_contents) { + try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); + + result.write(c); + if (is_crlf) result.write(source[index + 1]); + line_number += 1; + } + if (is_crlf) index += 1; + state = .line_start; + pending_start = null; + }, + else => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .double_quoted; + }, + }, .preprocessor => switch (c) { '\r', '\n' => { // Now that we have the full line we can decide what to do with it const preprocessor_str = source[preprocessor_start..index]; - const is_crlf = formsLineEndingPair(source, c, index + 1); if (std.mem.startsWith(u8, preprocessor_str, "#line")) { try handleLineCommand(allocator, preprocessor_str, ¤t_mapping); + const is_crlf = formsLineEndingPair(source, c, index + 1); + if (is_crlf) index += 1; + state = .line_start; + 
pending_start = null; } else { - if (!current_mapping.ignore_contents) { - try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); - - const line_ending_len: usize = if (is_crlf) 2 else 1; - result.writeSlice(source[pending_start.? .. index + line_ending_len]); - line_number += 1; - } + // Backtrack and reparse the line in the non_preprocessor state, + // since it's possible that this line contains a multiline comment + // start, etc. + state = .non_preprocessor; + index = pending_start.?; + pending_start = null; + // TODO: This is a hacky way to implement this, c needs to be + // updated since we're using continue :state here + c = source[index]; + // continue to avoid the index += 1 of the while loop + continue :state .non_preprocessor; } - if (is_crlf) index += 1; - state = .line_start; - pending_start = null; }, else => {}, }, @@ -143,6 +414,24 @@ pub fn parseAndRemoveLineCommands(allocator: Allocator, source: []const u8, buf: state = .line_start; pending_start = null; }, + '/' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .forward_slash; + }, + '\'' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .single_quoted; + }, + '"' => { + if (!current_mapping.ignore_contents) { + result.write(c); + } + state = .double_quoted; + }, else => { if (!current_mapping.ignore_contents) { result.write(c); @@ -153,7 +442,16 @@ pub fn parseAndRemoveLineCommands(allocator: Allocator, source: []const u8, buf: } else { switch (state) { .line_start => {}, - .non_preprocessor => { + .forward_slash, + .line_comment, + .multiline_comment, + .multiline_comment_end, + .single_quoted, + .single_quoted_escape, + .double_quoted, + .double_quoted_escape, + .non_preprocessor, + => { try handleLineEnd(allocator, line_number, &parse_result.mappings, ¤t_mapping); }, .preprocessor => { @@ -207,34 +505,40 @@ pub fn handleLineEnd(allocator: Allocator, post_processed_line_number: usize, ma try 
mapping.set(post_processed_line_number, current_mapping.line_num, filename_offset); - current_mapping.line_num += 1; + current_mapping.line_num = std.math.add(usize, current_mapping.line_num, 1) catch return error.LineNumberOverflow; current_mapping.pending = false; } // TODO: Might want to provide diagnostics on invalid line commands instead of just returning -pub fn handleLineCommand(allocator: Allocator, line_command: []const u8, current_mapping: *CurrentMapping) error{OutOfMemory}!void { +pub fn handleLineCommand(allocator: Allocator, line_command: []const u8, current_mapping: *CurrentMapping) error{ OutOfMemory, InvalidLineCommand }!void { // TODO: Are there other whitespace characters that should be included? var tokenizer = std.mem.tokenizeAny(u8, line_command, " \t"); - const line_directive = tokenizer.next() orelse return; // #line - if (!std.mem.eql(u8, line_directive, "#line")) return; - const linenum_str = tokenizer.next() orelse return; - const linenum = std.fmt.parseUnsigned(usize, linenum_str, 10) catch return; + const line_directive = tokenizer.next() orelse return error.InvalidLineCommand; // #line + if (!std.mem.eql(u8, line_directive, "#line")) return error.InvalidLineCommand; + const linenum_str = tokenizer.next() orelse return error.InvalidLineCommand; + const linenum = std.fmt.parseUnsigned(usize, linenum_str, 10) catch return error.InvalidLineCommand; + if (linenum == 0) return error.InvalidLineCommand; var filename_literal = tokenizer.rest(); while (filename_literal.len > 0 and std.ascii.isWhitespace(filename_literal[filename_literal.len - 1])) { filename_literal.len -= 1; } - if (filename_literal.len < 2) return; + if (filename_literal.len < 2) return error.InvalidLineCommand; const is_quoted = filename_literal[0] == '"' and filename_literal[filename_literal.len - 1] == '"'; - if (!is_quoted) return; - const filename = parseFilename(allocator, filename_literal[1 .. 
filename_literal.len - 1]) catch |err| switch (err) { + if (!is_quoted) return error.InvalidLineCommand; + const unquoted_filename = filename_literal[1 .. filename_literal.len - 1]; + + // Ignore <command line> and <built-in> + if (std.mem.eql(u8, unquoted_filename, "<command line>") or std.mem.eql(u8, unquoted_filename, "<built-in>")) return; + + const filename = parseFilename(allocator, unquoted_filename) catch |err| switch (err) { error.OutOfMemory => |e| return e, - else => return, + else => return error.InvalidLineCommand, }; defer allocator.free(filename); // \x00 bytes in the filename is incompatible with how StringTable works - if (std.mem.indexOfScalar(u8, filename, '\x00') != null) return; + if (std.mem.indexOfScalar(u8, filename, '\x00') != null) return error.InvalidLineCommand; current_mapping.line_num = linenum; current_mapping.filename.clearRetainingCapacity(); @@ -494,8 +798,12 @@ pub const SourceMappings = struct { if (node.key.filename_offset != filename_offset) { break :need_new_node true; } - const exist_delta = @as(i64, @intCast(node.key.corresponding_start_line)) - @as(i64, @intCast(node.key.start_line)); - const cur_delta = @as(i64, @intCast(corresponding_line_num)) - @as(i64, @intCast(line_num)); + // TODO: These use i65 to avoid truncation when any of the line number values + // use all 64 bits of the usize. In reality, line numbers can't really + // get that large so limiting the line number and using a smaller iX + // type here might be a better solution. + const exist_delta = @as(i65, @intCast(node.key.corresponding_start_line)) - @as(i65, @intCast(node.key.start_line)); + const cur_delta = @as(i65, @intCast(corresponding_line_num)) - @as(i65, @intCast(line_num)); if (exist_delta != cur_delta) { break :need_new_node true; } @@ -578,15 +886,8 @@ pub const SourceMappings = struct { inorder_node.key.start_line -= span_diff; // This can only really happen if there are #line commands within - // a multiline comment, which in theory should be skipped over.
- // However, currently, parseAndRemoveLineCommands is not aware of - // comments at all. - // - // TODO: Make parseAndRemoveLineCommands aware of comments/strings - // and turn this into an assertion - if (prev.key.start_line > inorder_node.key.start_line) { - return error.InvalidSourceMappingCollapse; - } + // a multiline comment, which should be skipped over. + std.debug.assert(prev.key.start_line <= inorder_node.key.start_line); prev = inorder_node; } self.end_line -= span_diff; @@ -594,7 +895,7 @@ pub const SourceMappings = struct { /// Returns true if the line is from the main/root file (i.e. not a file that has been /// `#include`d). - pub fn isRootFile(self: *SourceMappings, line_num: usize) bool { + pub fn isRootFile(self: *const SourceMappings, line_num: usize) bool { const source = self.get(line_num) orelse return false; return source.filename_offset == self.root_filename_offset; } @@ -803,9 +1104,6 @@ test "in place" { } test "line command within a multiline comment" { - // TODO: Enable once parseAndRemoveLineCommands is comment-aware - if (true) return error.SkipZigTest; - try testParseAndRemoveLineCommands( \\/* \\#line 1 "irrelevant.rc" @@ -825,4 +1123,103 @@ test "line command within a multiline comment" { \\ \\*/ , .{ .initial_filename = "blah.rc" }); + + // * but without / directly after + try testParseAndRemoveLineCommands( + \\/** / + \\#line 1 "irrelevant.rc" + \\*/ + , &[_]ExpectedSourceSpan{ + .{ .start_line = 1, .end_line = 1, .filename = "blah.rc" }, + .{ .start_line = 2, .end_line = 2, .filename = "blah.rc" }, + .{ .start_line = 3, .end_line = 3, .filename = "blah.rc" }, + }, + \\/** / + \\#line 1 "irrelevant.rc" + \\*/ + , .{ .initial_filename = "blah.rc" }); + + // /** and **/ + try testParseAndRemoveLineCommands( + \\/** + \\#line 1 "irrelevant.rc" + \\**/ + \\foo + , &[_]ExpectedSourceSpan{ + .{ .start_line = 1, .end_line = 1, .filename = "blah.rc" }, + .{ .start_line = 2, .end_line = 2, .filename = "blah.rc" }, + .{ .start_line = 3, 
.end_line = 3, .filename = "blah.rc" }, + .{ .start_line = 20, .end_line = 20, .filename = "blah.rc" }, + }, + \\/** + \\#line 1 "irrelevant.rc" + \\**/ + \\#line 20 "blah.rc" + \\foo + , .{ .initial_filename = "blah.rc" }); +} + +test "whitespace preservation" { + try testParseAndRemoveLineCommands( + \\ / + \\/ + , &[_]ExpectedSourceSpan{ + .{ .start_line = 1, .end_line = 1, .filename = "blah.rc" }, + .{ .start_line = 2, .end_line = 2, .filename = "blah.rc" }, + }, + \\ / + \\/ + , .{ .initial_filename = "blah.rc" }); +} + +test "preprocessor line with a multiline comment after" { + try testParseAndRemoveLineCommands( + \\#pragma test /* + \\ + \\*/ + , &[_]ExpectedSourceSpan{ + .{ .start_line = 1, .end_line = 1, .filename = "blah.rc" }, + .{ .start_line = 2, .end_line = 2, .filename = "blah.rc" }, + .{ .start_line = 3, .end_line = 3, .filename = "blah.rc" }, + }, + \\#pragma test /* + \\ + \\*/ + , .{ .initial_filename = "blah.rc" }); +} + +test "comment after line command" { + var mut_source = "#line 1 \"blah.rc\" /*".*; + try std.testing.expectError(error.InvalidLineCommand, parseAndRemoveLineCommands(std.testing.allocator, &mut_source, &mut_source, .{})); +} + +test "line command with 0 as line number" { + var mut_source = "#line 0 \"blah.rc\"".*; + try std.testing.expectError(error.InvalidLineCommand, parseAndRemoveLineCommands(std.testing.allocator, &mut_source, &mut_source, .{})); +} + +test "line number limits" { + // TODO: Avoid usize for line numbers + if (@sizeOf(usize) != 8) return error.SkipZigTest; + + // greater than i64 max + try testParseAndRemoveLineCommands( + \\ + , &[_]ExpectedSourceSpan{ + .{ .start_line = 11111111111111111111, .end_line = 11111111111111111111, .filename = "blah.rc" }, + }, + \\#line 11111111111111111111 "blah.rc" + , .{ .initial_filename = "blah.rc" }); + + // equal to u64 max, overflows on line number increment + { + var mut_source = "#line 18446744073709551615 \"blah.rc\"".*; + try 
std.testing.expectError(error.LineNumberOverflow, parseAndRemoveLineCommands(std.testing.allocator, &mut_source, &mut_source, .{})); + } + + // greater than u64 max + { + var mut_source = "#line 18446744073709551616 \"blah.rc\"".*; + try std.testing.expectError(error.InvalidLineCommand, parseAndRemoveLineCommands(std.testing.allocator, &mut_source, &mut_source, .{})); + } }