diff --git a/src/browser/parser/Parser.zig b/src/browser/parser/Parser.zig
index 3b2bebcd..7e41ecb5 100644
--- a/src/browser/parser/Parser.zig
+++ b/src/browser/parser/Parser.zig
@@ -98,6 +98,29 @@ pub fn parse(self: *Parser, html: []const u8) void {
);
}
+pub fn parseXML(self: *Parser, xml: []const u8) void { // Parses `xml` by forwarding it to `h5e.xml5ever_parse_document`; nothing is returned from here.
+ h5e.xml5ever_parse_document( // Arguments are positional and follow the extern declaration's parameter order.
+ xml.ptr, // start of the input bytes
+ xml.len, // byte length of the input
+ &self.container, // passed as the `doc` argument
+ self, // passed as the `ctx` argument
+ createElementCallback,
+ getDataCallback,
+ appendCallback,
+ parseErrorCallback,
+ popCallback,
+ createCommentCallback,
+ createProcessingInstruction,
+ appendDoctypeToDocument,
+ addAttrsIfMissingCallback,
+ getTemplateContentsCallback,
+ removeFromParentCallback,
+ reparentChildrenCallback,
+ appendBeforeSiblingCallback,
+ appendBasedOnParentNodeCallback,
+ );
+}
+
pub fn parseFragment(self: *Parser, html: []const u8) void {
h5e.html5ever_parse_fragment(
html.ptr,
diff --git a/src/browser/parser/html5ever.zig b/src/browser/parser/html5ever.zig
index afa22494..8ee873e2 100644
--- a/src/browser/parser/html5ever.zig
+++ b/src/browser/parser/html5ever.zig
@@ -171,3 +171,24 @@ pub const NodeOrText = extern struct {
text: []const u8,
};
};
+
+pub extern "c" fn xml5ever_parse_document( // Binding for the Rust-exported `xml5ever_parse_document`; parameter order must match that definition.
+    xml: [*c]const u8, // start of the XML input bytes
+    len: usize, // number of bytes readable at `xml`
+    doc: *anyopaque,
+    ctx: *anyopaque,
+    createElementCallback: *const fn (ctx: *anyopaque, data: *anyopaque, QualName, AttributeIterator) callconv(.c) ?*anyopaque,
+    getDataCallback: *const fn (node_ref: *anyopaque) callconv(.c) *anyopaque,
+    appendCallback: *const fn (ctx: *anyopaque, parent_ref: *anyopaque, NodeOrText) callconv(.c) void,
+    parseErrorCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) void,
+    popCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque) callconv(.c) void,
+    createCommentCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) ?*anyopaque,
+    createProcessingInstruction: *const fn (ctx: *anyopaque, StringSlice, StringSlice) callconv(.c) ?*anyopaque,
+    appendDoctypeToDocument: *const fn (ctx: *anyopaque, StringSlice, StringSlice, StringSlice) callconv(.c) void,
+    addAttrsIfMissingCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque, AttributeIterator) callconv(.c) void,
+    getTemplateContentsCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) ?*anyopaque,
+    removeFromParentCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) void,
+    reparentChildrenCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque, new_parent_ref: *anyopaque) callconv(.c) void,
+    appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
+    appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
+) void;
diff --git a/src/browser/tests/domparser.html b/src/browser/tests/domparser.html
index cd701d59..7f08b1ef 100644
--- a/src/browser/tests/domparser.html
+++ b/src/browser/tests/domparser.html
@@ -107,19 +107,6 @@
}
-
-
+
+
diff --git a/src/browser/webapi/DOMParser.zig b/src/browser/webapi/DOMParser.zig
index eb2d95d8..453b83de 100644
--- a/src/browser/webapi/DOMParser.zig
+++ b/src/browser/webapi/DOMParser.zig
@@ -19,8 +19,13 @@
const std = @import("std");
const js = @import("../js/js.zig");
+
const Page = @import("../Page.zig");
+const Parser = @import("../parser/Parser.zig");
+
const HTMLDocument = @import("HTMLDocument.zig");
+const XMLDocument = @import("XMLDocument.zig");
+const ProcessingInstruction = @import("../webapi/cdata/ProcessingInstruction.zig");
const DOMParser = @This();
@@ -28,34 +33,78 @@ pub fn init() DOMParser {
return .{};
}
-pub fn parseFromString(self: *const DOMParser, html: []const u8, mime_type: []const u8, page: *Page) !*HTMLDocument {
- _ = self;
+pub const HTMLDocumentOrXMLDocument = union(enum) { // Return type of `parseFromString`: exactly one of the two document kinds.
+ html_document: *HTMLDocument, // produced for the "text/html" MIME type
+ xml_document: *XMLDocument, // produced for the accepted XML MIME types ("text/xml", "application/xml", ...)
+};
- // For now, only support text/html
- if (!std.mem.eql(u8, mime_type, "text/html")) {
- return error.NotSupported;
- }
+/// Parses `html` into a fresh document selected by `mime_type`: "text/html"
+/// yields an `HTMLDocument`, the XML MIME types listed below yield an
+/// `XMLDocument`, and anything else returns `error.NotSupported`.
+pub fn parseFromString(
+    _: *const DOMParser,
+    html: []const u8,
+    mime_type: []const u8,
+    page: *Page,
+) !HTMLDocumentOrXMLDocument {
+    const maybe_target_mime = std.meta.stringToEnum(enum {
+        @"text/html",
+        @"text/xml",
+        @"application/xml",
+        @"application/xhtml+xml",
+        @"image/svg+xml",
+    }, mime_type);
- // Create a new HTMLDocument
- const doc = try page._factory.document(HTMLDocument{
- ._proto = undefined,
- });
+    if (maybe_target_mime) |target_mime| switch (target_mime) {
+        .@"text/html" => {
+            // Create a new HTMLDocument
+            const doc = try page._factory.document(HTMLDocument{
+                ._proto = undefined,
+            });
- var normalized = std.mem.trim(u8, html, &std.ascii.whitespace);
- if (normalized.len == 0) {
- normalized = "";
- }
+            const trimmed = std.mem.trim(u8, html, &std.ascii.whitespace);
+            const normalized: []const u8 = if (trimmed.len == 0) "" else trimmed;
- // Parse HTML into the document
- const Parser = @import("../parser/Parser.zig");
- var parser = Parser.init(page.arena, doc.asNode(), page);
- parser.parse(normalized);
+            // Parse HTML into the document
+            var parser = Parser.init(page.arena, doc.asNode(), page);
+            parser.parse(normalized);
- if (parser.err) |pe| {
- return pe.err;
- }
+            if (parser.err) |pe| {
+                return pe.err;
+            }
- return doc;
+            return .{ .html_document = doc };
+        },
+        .@"text/xml", .@"application/xml", .@"application/xhtml+xml", .@"image/svg+xml" => {
+            // Create a new XMLDocument.
+            const doc = try page._factory.document(XMLDocument{
+                ._proto = undefined,
+            });
+
+            // Parse XML into XMLDocument.
+            const doc_node = doc.asNode();
+            var parser = Parser.init(page.arena, doc_node, page);
+            parser.parseXML(html);
+
+            if (parser.err) |pe| {
+                return pe.err;
+            }
+
+            // Drop a leading `ProcessingInstruction` (node type 7).
+            // NOTE(review): presumably aimed at the `<?xml ...?>` declaration; confirm.
+            // Empty input produces no nodes and no parse error, so a missing
+            // first child is a valid outcome, not an invariant violation.
+            if (doc_node.firstChild()) |first_child| {
+                if (first_child.getNodeType() == 7) {
+                    _ = try doc_node.removeChild(first_child, page);
+                }
+            }
+
+            return .{ .xml_document = doc };
+        },
+    };
+
+    return error.NotSupported;
}
pub const JsApi = struct {
diff --git a/src/html5ever/Cargo.lock b/src/html5ever/Cargo.lock
index 60c82b59..d94a7fd7 100644
--- a/src/html5ever/Cargo.lock
+++ b/src/html5ever/Cargo.lock
@@ -72,6 +72,7 @@ dependencies = [
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"typed-arena",
+ "xml5ever",
]
[[package]]
@@ -476,3 +477,13 @@ name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "xml5ever"
+version = "0.35.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee3f1e41afb31a75aef076563b0ad3ecc24f5bd9d12a72b132222664eb76b494"
+dependencies = [
+ "log",
+ "markup5ever",
+]
diff --git a/src/html5ever/Cargo.toml b/src/html5ever/Cargo.toml
index 7cc94245..b4004404 100644
--- a/src/html5ever/Cargo.toml
+++ b/src/html5ever/Cargo.toml
@@ -14,6 +14,7 @@ string_cache = "0.9.0"
typed-arena = "2.0.2"
tikv-jemallocator = {version = "0.6.0", features = ["stats"]}
tikv-jemalloc-ctl = {version = "0.6.0", features = ["stats"]}
+xml5ever = "0.35.0"
[profile.release]
lto = true
diff --git a/src/html5ever/lib.rs b/src/html5ever/lib.rs
index 308001de..59ef6890 100644
--- a/src/html5ever/lib.rs
+++ b/src/html5ever/lib.rs
@@ -16,20 +16,20 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
-mod types;
mod sink;
+mod types;
#[cfg(debug_assertions)]
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
-use types::*;
use std::cell::Cell;
use std::os::raw::{c_uchar, c_void};
+use types::*;
-use html5ever::{parse_document, parse_fragment, QualName, LocalName, ns, ParseOpts, Parser};
-use html5ever::tendril::{TendrilSink, StrTendril};
use html5ever::interface::tree_builder::QuirksMode;
+use html5ever::tendril::{StrTendril, TendrilSink};
+use html5ever::{ns, parse_document, parse_fragment, LocalName, ParseOpts, Parser, QualName};
#[no_mangle]
pub extern "C" fn html5ever_parse_document(
@@ -135,13 +135,14 @@ pub extern "C" fn html5ever_parse_fragment(
let bytes = unsafe { std::slice::from_raw_parts(html, len) };
parse_fragment(
- sink, Default::default(),
+ sink,
+ Default::default(),
QualName::new(None, ns!(html), LocalName::from("body")),
- vec![], // attributes
- false, // context_element_allows_scripting
+ vec![], // attributes
+ false, // context_element_allows_scripting
)
- .from_utf8()
- .one(bytes);
+ .from_utf8()
+ .one(bytes);
}
#[no_mangle]
@@ -182,15 +183,15 @@ pub struct Memory {
#[cfg(debug_assertions)]
#[no_mangle]
pub extern "C" fn html5ever_get_memory_usage() -> Memory {
- use tikv_jemalloc_ctl::{stats, epoch};
+ use tikv_jemalloc_ctl::{epoch, stats};
// many statistics are cached and only updated when the epoch is advanced.
epoch::advance().unwrap();
- return Memory{
+ return Memory {
resident: stats::resident::read().unwrap(),
allocated: stats::allocated::read().unwrap(),
- }
+ };
}
// Streaming parser API
@@ -225,9 +226,8 @@ pub extern "C" fn html5ever_streaming_parser_create(
// SAFETY: We're creating a self-referential structure here.
// The arena is stored in the StreamingParser and lives as long as the parser.
// The sink contains a reference to the arena that's valid for the parser's lifetime.
- let arena_ref: &'static typed_arena::Arena = unsafe {
- std::mem::transmute(arena.as_ref())
- };
+ let arena_ref: &'static typed_arena::Arena =
+ unsafe { std::mem::transmute(arena.as_ref()) };
let sink = sink::Sink {
ctx: ctx,
@@ -281,7 +281,8 @@ pub extern "C" fn html5ever_streaming_parser_feed(
// Feed the chunk to the parser
// The Parser implements TendrilSink, so we can call process() on it
- let parser = streaming_parser.parser
+ let parser = streaming_parser
+ .parser
.downcast_mut::>()
.expect("Invalid parser type");
@@ -304,7 +305,8 @@ pub extern "C" fn html5ever_streaming_parser_finish(parser_ptr: *mut c_void) {
let streaming_parser = unsafe { Box::from_raw(parser_ptr as *mut StreamingParser) };
// Extract and finish the parser
- let parser = streaming_parser.parser
+ let parser = streaming_parser
+ .parser
.downcast::>()
.expect("Invalid parser type");
@@ -326,3 +328,57 @@ pub extern "C" fn html5ever_streaming_parser_destroy(parser_ptr: *mut c_void) {
let _ = Box::from_raw(parser_ptr as *mut StreamingParser);
}
}
+
+/// Parses `len` bytes at `xml` (decoded as UTF-8) with xml5ever, feeding a `sink::Sink` built from `document`, `ctx` and the callbacks.
+/// Does nothing when `xml` is null or `len` is 0. Safety: `xml` must be readable for `len` bytes for the duration of the call.
+#[no_mangle]
+pub extern "C" fn xml5ever_parse_document(
+    xml: *const c_uchar,
+    len: usize,
+    document: Ref,
+    ctx: Ref,
+    create_element_callback: CreateElementCallback,
+    get_data_callback: GetDataCallback,
+    append_callback: AppendCallback,
+    parse_error_callback: ParseErrorCallback,
+    pop_callback: PopCallback,
+    create_comment_callback: CreateCommentCallback,
+    create_processing_instruction: CreateProcessingInstruction,
+    append_doctype_to_document: AppendDoctypeToDocumentCallback,
+    add_attrs_if_missing_callback: AddAttrsIfMissingCallback,
+    get_template_contents_callback: GetTemplateContentsCallback,
+    remove_from_parent_callback: RemoveFromParentCallback,
+    reparent_children_callback: ReparentChildrenCallback,
+    append_before_sibling_callback: AppendBeforeSiblingCallback,
+    append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback,
+) {
+    if xml.is_null() || len == 0 {
+        return;
+    }
+    let arena = typed_arena::Arena::new();
+    let sink = sink::Sink {
+        ctx,
+        arena: &arena,
+        document,
+        quirks_mode: Cell::new(QuirksMode::NoQuirks),
+        pop_callback,
+        append_callback,
+        get_data_callback,
+        parse_error_callback,
+        create_element_callback,
+        create_comment_callback,
+        create_processing_instruction,
+        append_doctype_to_document,
+        add_attrs_if_missing_callback,
+        get_template_contents_callback,
+        remove_from_parent_callback,
+        reparent_children_callback,
+        append_before_sibling_callback,
+        append_based_on_parent_node_callback,
+    };
+    // SAFETY: `xml` was checked to be non-null above; the caller guarantees `len` readable bytes.
+    let bytes = unsafe { std::slice::from_raw_parts(xml, len) };
+    xml5ever::driver::parse_document(sink, xml5ever::driver::XmlParseOpts::default())
+        .from_utf8()
+        .one(bytes);
+}