diff --git a/src/browser/parser/Parser.zig b/src/browser/parser/Parser.zig
index 3b2bebcd..7e41ecb5 100644
--- a/src/browser/parser/Parser.zig
+++ b/src/browser/parser/Parser.zig
@@ -98,6 +98,32 @@ pub fn parse(self: *Parser, html: []const u8) void {
     );
 }
 
+/// Parses `xml` as an XML document by handing it to `xml5ever_parse_document`
+/// together with this parser's container and tree-building callbacks.
+/// Nothing is returned; callers inspect `self.err` after the call.
+pub fn parseXML(self: *Parser, xml: []const u8) void {
+    h5e.xml5ever_parse_document(
+        xml.ptr,
+        xml.len,
+        &self.container,
+        self,
+        createElementCallback,
+        getDataCallback,
+        appendCallback,
+        parseErrorCallback,
+        popCallback,
+        createCommentCallback,
+        createProcessingInstruction,
+        appendDoctypeToDocument,
+        addAttrsIfMissingCallback,
+        getTemplateContentsCallback,
+        removeFromParentCallback,
+        reparentChildrenCallback,
+        appendBeforeSiblingCallback,
+        appendBasedOnParentNodeCallback,
+    );
+}
+
 pub fn parseFragment(self: *Parser, html: []const u8) void {
     h5e.html5ever_parse_fragment(
         html.ptr,
diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig
index afa22494..8ee873e2 100644
--- a/src/browser/parser/html5ever.zig
+++ b/src/browser/parser/html5ever.zig
@@ -171,3 +171,26 @@ pub const NodeOrText = extern struct {
         text: []const u8,
     };
 };
+
+/// Entry point of the Rust `xml5ever_parse_document` export: parses `len` bytes
+/// at `xml` as an XML document and reports the tree through the callbacks.
+pub extern "c" fn xml5ever_parse_document(
+    xml: [*c]const u8,
+    len: usize,
+    doc: *anyopaque,
+    ctx: *anyopaque,
+    createElementCallback: *const fn (ctx: *anyopaque, data: *anyopaque, QualName, AttributeIterator) callconv(.c) ?*anyopaque,
+    getDataCallback: *const fn (node_ref: *anyopaque) callconv(.c) *anyopaque,
+    appendCallback: *const fn (ctx: *anyopaque, parent_ref: *anyopaque, NodeOrText) callconv(.c) void,
+    parseErrorCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) void,
+    popCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque) callconv(.c) void,
+    createCommentCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) ?*anyopaque,
+    createProcessingInstruction: *const fn (ctx: *anyopaque, StringSlice, StringSlice) callconv(.c) ?*anyopaque,
+    appendDoctypeToDocument: *const fn (ctx: *anyopaque, StringSlice, StringSlice, StringSlice) callconv(.c) void,
+    addAttrsIfMissingCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque, AttributeIterator) callconv(.c) void,
+    getTemplateContentsCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) ?*anyopaque,
+    removeFromParentCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) void,
+    reparentChildrenCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque, new_parent_ref: *anyopaque) callconv(.c) void,
+    appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
+    appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
+) void;
diff --git a/src/browser/webapi/DOMParser.zig b/src/browser/webapi/DOMParser.zig
index eb2d95d8..46ddd14b 100644
--- a/src/browser/webapi/DOMParser.zig
+++ b/src/browser/webapi/DOMParser.zig
@@ -21,6 +21,7 @@ const std = @import("std");
 const js = @import("../js/js.zig");
 const Page = @import("../Page.zig");
 const HTMLDocument = @import("HTMLDocument.zig");
+const XMLDocument = @import("XMLDocument.zig");
 
 const DOMParser = @This();
 
@@ -28,34 +29,73 @@ pub fn init() DOMParser {
     return .{};
 }
 
-pub fn parseFromString(self: *const DOMParser, html: []const u8, mime_type: []const u8, page: *Page) !*HTMLDocument {
-    _ = self;
+/// The document produced by `parseFromString`; the active variant depends on
+/// the MIME type that was requested.
+pub const HTMLDocumentOrXMLDocument = union(enum) {
+    html_document: *HTMLDocument,
+    xml_document: *XMLDocument,
+};
 
-    // For now, only support text/html
-    if (!std.mem.eql(u8, mime_type, "text/html")) {
-        return error.NotSupported;
+/// Parses `html` into a newly created document chosen by `mime_type`:
+/// "text/html" produces an `HTMLDocument` through the HTML parser, while the
+/// XML types ("text/xml", "application/xml", "application/xhtml+xml" and
+/// "image/svg+xml") produce an `XMLDocument` through the XML parser.
+/// Returns `error.NotSupported` for any other MIME type, and the error the
+/// parser recorded when it reported one.
+pub fn parseFromString(
+    _: *const DOMParser,
+    html: []const u8,
+    mime_type: []const u8,
+    page: *Page,
+) !HTMLDocumentOrXMLDocument {
+    if (std.mem.eql(u8, mime_type, "text/html")) {
+        // Create a new HTMLDocument
+        const doc = try page._factory.document(HTMLDocument{
+            ._proto = undefined,
+        });
+
+        var normalized = std.mem.trim(u8, html, &std.ascii.whitespace);
+        // NOTE(review): presumably gives the parser a valid pointer for empty input; confirm.
+        if (normalized.len == 0) {
+            normalized = "";
+        }
+
+        // Parse HTML into the document
+        const Parser = @import("../parser/Parser.zig");
+        var parser = Parser.init(page.arena, doc.asNode(), page);
+        parser.parse(normalized);
+
+        if (parser.err) |pe| {
+            return pe.err;
+        }
+
+        return .{ .html_document = doc };
     }
 
-    // Create a new HTMLDocument
-    const doc = try page._factory.document(HTMLDocument{
-        ._proto = undefined,
-    });
+    // Every non-HTML entry of the DOMParserSupportedType enum is an XML type.
+    const is_xml = std.mem.eql(u8, mime_type, "text/xml") or
+        std.mem.eql(u8, mime_type, "application/xml") or
+        std.mem.eql(u8, mime_type, "application/xhtml+xml") or
+        std.mem.eql(u8, mime_type, "image/svg+xml");
+    if (is_xml) {
+        // Create a new XMLDocument.
+        const doc = try page._factory.document(XMLDocument{
+            ._proto = undefined,
+        });
 
-    var normalized = std.mem.trim(u8, html, &std.ascii.whitespace);
-    if (normalized.len == 0) {
-        normalized = "";
+        // Parse XML into XMLDocument.
+        const Parser = @import("../parser/Parser.zig");
+        var parser = Parser.init(page.arena, doc.asNode(), page);
+        parser.parseXML(html);
+
+        if (parser.err) |pe| {
+            return pe.err;
+        }
+
+        return .{ .xml_document = doc };
     }
 
-    // Parse HTML into the document
-    const Parser = @import("../parser/Parser.zig");
-    var parser = Parser.init(page.arena, doc.asNode(), page);
-    parser.parse(normalized);
-
-    if (parser.err) |pe| {
-        return pe.err;
-    }
-
-    return doc;
+    return error.NotSupported;
 }
 
 pub const JsApi = struct {
diff --git a/src/html5ever/Cargo.lock b/src/html5ever/Cargo.lock
index 60c82b59..d94a7fd7 100644
--- a/src/html5ever/Cargo.lock
+++ b/src/html5ever/Cargo.lock
@@ -72,6 +72,7 @@ dependencies = [
  "tikv-jemalloc-ctl",
  "tikv-jemallocator",
  "typed-arena",
+ "xml5ever",
 ]
 
 [[package]]
@@ -476,3 +477,13 @@ name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "xml5ever"
+version = "0.35.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee3f1e41afb31a75aef076563b0ad3ecc24f5bd9d12a72b132222664eb76b494"
+dependencies = [
+ "log",
+ "markup5ever",
+]
diff --git a/src/html5ever/Cargo.toml b/src/html5ever/Cargo.toml
index 7cc94245..b4004404 100644
--- a/src/html5ever/Cargo.toml
+++ b/src/html5ever/Cargo.toml
@@ -14,6 +14,7 @@ string_cache = "0.9.0"
 typed-arena = "2.0.2"
 tikv-jemallocator = {version = "0.6.0", features = ["stats"]}
 tikv-jemalloc-ctl = {version = "0.6.0", features = ["stats"]}
+xml5ever = "0.35.0"
 
 [profile.release]
 lto = true
diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs
index 308001de..59ef6890 100644
--- a/src/html5ever/lib.rs
+++ b/src/html5ever/lib.rs
@@ -16,20 +16,20 @@
 // You should have received a copy of the GNU Affero General Public License
 // along with this program. If not, see .
 
-mod types;
 mod sink;
+mod types;
 
 #[cfg(debug_assertions)]
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
-use types::*;
 use std::cell::Cell;
 use std::os::raw::{c_uchar, c_void};
+use types::*;
 
-use html5ever::{parse_document, parse_fragment, QualName, LocalName, ns, ParseOpts, Parser};
-use html5ever::tendril::{TendrilSink, StrTendril};
 use html5ever::interface::tree_builder::QuirksMode;
+use html5ever::tendril::{StrTendril, TendrilSink};
+use html5ever::{ns, parse_document, parse_fragment, LocalName, ParseOpts, Parser, QualName};
 
 #[no_mangle]
 pub extern "C" fn html5ever_parse_document(
@@ -135,13 +135,14 @@ pub extern "C" fn html5ever_parse_fragment(
 
     let bytes = unsafe { std::slice::from_raw_parts(html, len) };
     parse_fragment(
-        sink, Default::default(),
+        sink,
+        Default::default(),
         QualName::new(None, ns!(html), LocalName::from("body")),
-        vec![], // attributes
-        false, // context_element_allows_scripting
+        vec![], // attributes
+        false,  // context_element_allows_scripting
     )
-        .from_utf8()
-        .one(bytes);
+    .from_utf8()
+    .one(bytes);
 }
 
 #[no_mangle]
@@ -182,15 +183,15 @@ pub struct Memory {
 #[cfg(debug_assertions)]
 #[no_mangle]
 pub extern "C" fn html5ever_get_memory_usage() -> Memory {
-    use tikv_jemalloc_ctl::{stats, epoch};
+    use tikv_jemalloc_ctl::{epoch, stats};
 
     // many statistics are cached and only updated when the epoch is advanced.
     epoch::advance().unwrap();
 
-    return Memory{
+    return Memory {
         resident: stats::resident::read().unwrap(),
         allocated: stats::allocated::read().unwrap(),
-    }
+    };
 }
 
 // Streaming parser API
@@ -225,9 +226,8 @@ pub extern "C" fn html5ever_streaming_parser_create(
     // SAFETY: We're creating a self-referential structure here.
     // The arena is stored in the StreamingParser and lives as long as the parser.
     // The sink contains a reference to the arena that's valid for the parser's lifetime.
-    let arena_ref: &'static typed_arena::Arena = unsafe {
-        std::mem::transmute(arena.as_ref())
-    };
+    let arena_ref: &'static typed_arena::Arena =
+        unsafe { std::mem::transmute(arena.as_ref()) };
 
     let sink = sink::Sink {
         ctx: ctx,
@@ -281,7 +281,8 @@ pub extern "C" fn html5ever_streaming_parser_feed(
 
     // Feed the chunk to the parser
     // The Parser implements TendrilSink, so we can call process() on it
-    let parser = streaming_parser.parser
+    let parser = streaming_parser
+        .parser
         .downcast_mut::>()
         .expect("Invalid parser type");
 
@@ -304,7 +305,8 @@ pub extern "C" fn html5ever_streaming_parser_finish(parser_ptr: *mut c_void) {
     let streaming_parser = unsafe { Box::from_raw(parser_ptr as *mut StreamingParser) };
 
     // Extract and finish the parser
-    let parser = streaming_parser.parser
+    let parser = streaming_parser
+        .parser
         .downcast::>()
         .expect("Invalid parser type");
 
@@ -326,3 +328,66 @@ pub extern "C" fn html5ever_streaming_parser_destroy(parser_ptr: *mut c_void) {
         let _ = Box::from_raw(parser_ptr as *mut StreamingParser);
     }
 }
+
+/// Parses `len` bytes at `xml` as an XML document with xml5ever, reporting the
+/// tree to the caller through the supplied callbacks.
+///
+/// Returns without invoking any callback when `xml` is null or `len` is zero.
+///
+/// # Safety
+///
+/// The function reads `len` bytes starting at `xml`, so a non-null `xml` must
+/// point to at least that many readable bytes for the duration of the call.
+#[no_mangle]
+pub extern "C" fn xml5ever_parse_document(
+    xml: *const c_uchar,
+    len: usize,
+    document: Ref,
+    ctx: Ref,
+    create_element_callback: CreateElementCallback,
+    get_data_callback: GetDataCallback,
+    append_callback: AppendCallback,
+    parse_error_callback: ParseErrorCallback,
+    pop_callback: PopCallback,
+    create_comment_callback: CreateCommentCallback,
+    create_processing_instruction: CreateProcessingInstruction,
+    append_doctype_to_document: AppendDoctypeToDocumentCallback,
+    add_attrs_if_missing_callback: AddAttrsIfMissingCallback,
+    get_template_contents_callback: GetTemplateContentsCallback,
+    remove_from_parent_callback: RemoveFromParentCallback,
+    reparent_children_callback: ReparentChildrenCallback,
+    append_before_sibling_callback: AppendBeforeSiblingCallback,
+    append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback,
+) {
+    if xml.is_null() || len == 0 {
+        return;
+    }
+
+    let arena = typed_arena::Arena::new();
+
+    let sink = sink::Sink {
+        ctx: ctx,
+        arena: &arena,
+        document: document,
+        quirks_mode: Cell::new(QuirksMode::NoQuirks),
+        pop_callback: pop_callback,
+        append_callback: append_callback,
+        get_data_callback: get_data_callback,
+        parse_error_callback: parse_error_callback,
+        create_element_callback: create_element_callback,
+        create_comment_callback: create_comment_callback,
+        create_processing_instruction: create_processing_instruction,
+        append_doctype_to_document: append_doctype_to_document,
+        add_attrs_if_missing_callback: add_attrs_if_missing_callback,
+        get_template_contents_callback: get_template_contents_callback,
+        remove_from_parent_callback: remove_from_parent_callback,
+        reparent_children_callback: reparent_children_callback,
+        append_before_sibling_callback: append_before_sibling_callback,
+        append_based_on_parent_node_callback: append_based_on_parent_node_callback,
+    };
+
+    let bytes = unsafe { std::slice::from_raw_parts(xml, len) };
+    xml5ever::driver::parse_document(sink, xml5ever::driver::XmlParseOpts::default())
+        .from_utf8()
+        .one(bytes);
+}