mirror of
https://github.com/lightpanda-io/browser.git
synced 2025-12-15 07:48:58 +00:00
loader: parse html per chunk
This commit is contained in:
@@ -152,11 +152,13 @@ pub const Page = struct {
|
|||||||
|
|
||||||
// dump writes the page content into the given file.
|
// dump writes the page content into the given file.
|
||||||
pub fn dump(self: *Page, out: std.fs.File) !void {
|
pub fn dump(self: *Page, out: std.fs.File) !void {
|
||||||
// no data loaded, nothin to do.
|
|
||||||
if (self.raw_data == null) return;
|
|
||||||
|
|
||||||
// if no HTML document pointer available, dump the data content only.
|
// if no HTML document pointer available, dump the data content only.
|
||||||
if (self.doc == null) return try out.writeAll(self.raw_data.?);
|
if (self.doc == null) {
|
||||||
|
// no data loaded, nothing to do.
|
||||||
|
if (self.raw_data == null) return;
|
||||||
|
return try out.writeAll(self.raw_data.?);
|
||||||
|
}
|
||||||
|
|
||||||
// if the page has a pointer to a document, dumps the HTML.
|
// if the page has a pointer to a document, dumps the HTML.
|
||||||
const root = try parser.documentGetDocumentElement(self.doc.?) orelse return;
|
const root = try parser.documentGetDocumentElement(self.doc.?) orelse return;
|
||||||
@@ -175,22 +177,19 @@ pub const Page = struct {
|
|||||||
// TODO handle fragment in url.
|
// TODO handle fragment in url.
|
||||||
|
|
||||||
// load the data
|
// load the data
|
||||||
var result = try self.loader.fetch(self.allocator, self.uri);
|
var resp = try self.loader.get(self.allocator, self.uri);
|
||||||
defer result.deinit();
|
defer resp.deinit();
|
||||||
|
|
||||||
log.info("GET {any} {d}", .{ self.uri, result.status });
|
const req = resp.req;
|
||||||
|
|
||||||
|
log.info("GET {any} {d}", .{ self.uri, req.response.status });
|
||||||
|
|
||||||
// TODO handle redirection
|
// TODO handle redirection
|
||||||
if (result.status != .ok) return error.BadStatusCode;
|
if (req.response.status != .ok) return error.BadStatusCode;
|
||||||
|
|
||||||
if (result.body == null) return error.NoBody;
|
|
||||||
|
|
||||||
// save the body into the page.
|
|
||||||
self.raw_data = try self.allocator.dupe(u8, result.body.?);
|
|
||||||
|
|
||||||
// TODO handle charset
|
// TODO handle charset
|
||||||
// https://html.spec.whatwg.org/#content-type
|
// https://html.spec.whatwg.org/#content-type
|
||||||
const ct = result.headers.getFirstValue("Content-Type") orelse {
|
const ct = req.response.headers.getFirstValue("Content-Type") orelse {
|
||||||
// no content type in HTTP headers.
|
// no content type in HTTP headers.
|
||||||
// TODO try to sniff mime type from the body.
|
// TODO try to sniff mime type from the body.
|
||||||
log.info("no content-type HTTP header", .{});
|
log.info("no content-type HTTP header", .{});
|
||||||
@@ -199,16 +198,19 @@ pub const Page = struct {
|
|||||||
const mime = try Mime.parse(ct);
|
const mime = try Mime.parse(ct);
|
||||||
if (mime.eql(Mime.HTML)) {
|
if (mime.eql(Mime.HTML)) {
|
||||||
// TODO check content-type
|
// TODO check content-type
|
||||||
try self.loadHTMLDoc(&result);
|
try self.loadHTMLDoc(req.reader());
|
||||||
} else {
|
} else {
|
||||||
log.info("non-HTML document: {s}", .{ct});
|
log.info("non-HTML document: {s}", .{ct});
|
||||||
|
|
||||||
|
// save the body into the page.
|
||||||
|
self.raw_data = try req.reader().readAllAlloc(self.allocator, 16 * 1024 * 1024);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://html.spec.whatwg.org/#read-html
|
// https://html.spec.whatwg.org/#read-html
|
||||||
fn loadHTMLDoc(self: *Page, result: *FetchResult) !void {
|
fn loadHTMLDoc(self: *Page, reader: anytype) !void {
|
||||||
log.debug("parse html", .{});
|
log.debug("parse html", .{});
|
||||||
const html_doc = try parser.documentHTMLParseFromStr(result.body.?);
|
const html_doc = try parser.documentHTMLParse(reader);
|
||||||
const doc = parser.documentHTMLToDocument(html_doc);
|
const doc = parser.documentHTMLToDocument(html_doc);
|
||||||
|
|
||||||
// save a document's pointer in the page.
|
// save a document's pointer in the page.
|
||||||
|
|||||||
@@ -6,10 +6,12 @@ pub const Loader = struct {
|
|||||||
client: std.http.Client,
|
client: std.http.Client,
|
||||||
|
|
||||||
pub const Response = struct {
|
pub const Response = struct {
|
||||||
req: std.http.Request,
|
allocator: std.mem.Allocator,
|
||||||
|
req: *std.http.Client.Request,
|
||||||
|
|
||||||
pub fn deinit(self: *Response) void {
|
pub fn deinit(self: *Response) void {
|
||||||
self.req.deinit();
|
self.req.deinit();
|
||||||
|
self.allocator.destroy(self.req);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -40,6 +42,36 @@ pub const Loader = struct {
|
|||||||
.payload = .none,
|
.payload = .none,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// see
|
||||||
|
// https://ziglang.org/documentation/master/std/#A;std:http.Client.fetch
|
||||||
|
// for reference.
|
||||||
|
// The caller is responsible for calling `deinit()` on the `Response`.
|
||||||
|
pub fn get(self: *Loader, allocator: std.mem.Allocator, uri: std.Uri) !Response {
|
||||||
|
var headers = try std.http.Headers.initList(allocator, &[_]std.http.Field{
|
||||||
|
.{ .name = "User-Agent", .value = user_agent },
|
||||||
|
.{ .name = "Accept", .value = "*/*" },
|
||||||
|
.{ .name = "Accept-Language", .value = "en-US,en;q=0.5" },
|
||||||
|
});
|
||||||
|
defer headers.deinit();
|
||||||
|
|
||||||
|
var resp = Response{
|
||||||
|
.allocator = allocator,
|
||||||
|
.req = try allocator.create(std.http.Client.Request),
|
||||||
|
};
|
||||||
|
errdefer allocator.destroy(resp.req);
|
||||||
|
|
||||||
|
resp.req.* = try self.client.open(.GET, uri, headers, .{
|
||||||
|
.handle_redirects = true, // TODO handle redirects manually
|
||||||
|
});
|
||||||
|
errdefer resp.req.deinit();
|
||||||
|
|
||||||
|
try resp.req.send(.{});
|
||||||
|
try resp.req.finish();
|
||||||
|
try resp.req.wait();
|
||||||
|
|
||||||
|
return resp;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
test "basic url fetch" {
|
test "basic url fetch" {
|
||||||
|
|||||||
Reference in New Issue
Block a user