mirror of
https://github.com/lightpanda-io/browser.git
synced 2025-10-30 07:31:47 +00:00
loader: parse html per chunk
This commit is contained in:
@@ -152,11 +152,13 @@ pub const Page = struct {
|
||||
|
||||
// dump writes the page content into the given file.
|
||||
pub fn dump(self: *Page, out: std.fs.File) !void {
|
||||
// no data loaded, nothin to do.
|
||||
if (self.raw_data == null) return;
|
||||
|
||||
// if no HTML document pointer available, dump the data content only.
|
||||
if (self.doc == null) return try out.writeAll(self.raw_data.?);
|
||||
if (self.doc == null) {
|
||||
// no data loaded, nothing to do.
|
||||
if (self.raw_data == null) return;
|
||||
return try out.writeAll(self.raw_data.?);
|
||||
}
|
||||
|
||||
// if the page has a pointer to a document, dumps the HTML.
|
||||
const root = try parser.documentGetDocumentElement(self.doc.?) orelse return;
|
||||
@@ -175,22 +177,19 @@ pub const Page = struct {
|
||||
// TODO handle fragment in url.
|
||||
|
||||
// load the data
|
||||
var result = try self.loader.fetch(self.allocator, self.uri);
|
||||
defer result.deinit();
|
||||
var resp = try self.loader.get(self.allocator, self.uri);
|
||||
defer resp.deinit();
|
||||
|
||||
log.info("GET {any} {d}", .{ self.uri, result.status });
|
||||
const req = resp.req;
|
||||
|
||||
log.info("GET {any} {d}", .{ self.uri, req.response.status });
|
||||
|
||||
// TODO handle redirection
|
||||
if (result.status != .ok) return error.BadStatusCode;
|
||||
|
||||
if (result.body == null) return error.NoBody;
|
||||
|
||||
// save the body into the page.
|
||||
self.raw_data = try self.allocator.dupe(u8, result.body.?);
|
||||
if (req.response.status != .ok) return error.BadStatusCode;
|
||||
|
||||
// TODO handle charset
|
||||
// https://html.spec.whatwg.org/#content-type
|
||||
const ct = result.headers.getFirstValue("Content-Type") orelse {
|
||||
const ct = req.response.headers.getFirstValue("Content-Type") orelse {
|
||||
// no content type in HTTP headers.
|
||||
// TODO try to sniff mime type from the body.
|
||||
log.info("no content-type HTTP header", .{});
|
||||
@@ -199,16 +198,19 @@ pub const Page = struct {
|
||||
const mime = try Mime.parse(ct);
|
||||
if (mime.eql(Mime.HTML)) {
|
||||
// TODO check content-type
|
||||
try self.loadHTMLDoc(&result);
|
||||
try self.loadHTMLDoc(req.reader());
|
||||
} else {
|
||||
log.info("non-HTML document: {s}", .{ct});
|
||||
|
||||
// save the body into the page.
|
||||
self.raw_data = try req.reader().readAllAlloc(self.allocator, 16 * 1024 * 1024);
|
||||
}
|
||||
}
|
||||
|
||||
// https://html.spec.whatwg.org/#read-html
|
||||
fn loadHTMLDoc(self: *Page, result: *FetchResult) !void {
|
||||
fn loadHTMLDoc(self: *Page, reader: anytype) !void {
|
||||
log.debug("parse html", .{});
|
||||
const html_doc = try parser.documentHTMLParseFromStr(result.body.?);
|
||||
const html_doc = try parser.documentHTMLParse(reader);
|
||||
const doc = parser.documentHTMLToDocument(html_doc);
|
||||
|
||||
// save a document's pointer in the page.
|
||||
|
||||
@@ -6,10 +6,12 @@ pub const Loader = struct {
|
||||
client: std.http.Client,
|
||||
|
||||
pub const Response = struct {
|
||||
req: std.http.Request,
|
||||
allocator: std.mem.Allocator,
|
||||
req: *std.http.Client.Request,
|
||||
|
||||
pub fn deinit(self: *Response) void {
|
||||
self.req.deinit();
|
||||
self.allocator.destroy(self.req);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -40,6 +42,36 @@ pub const Loader = struct {
|
||||
.payload = .none,
|
||||
});
|
||||
}
|
||||
|
||||
// get opens a GET request for `uri` on the loader's HTTP client, then calls
// `send`, `finish` and `wait` on it before returning the wrapping `Response`.
// The response body is not read here. See
|
||||
// https://ziglang.org/documentation/master/std/#A;std:http.Client.fetch
|
||||
// for reference.
|
||||
// The caller is responsible for calling `deinit()` on the `Response`.
// `allocator` is used for the temporary header list and for the request
// object itself; it is stored in the returned `Response`.
// Any error from the allocations or from open/send/finish/wait is propagated.
|
||||
pub fn get(self: *Loader, allocator: std.mem.Allocator, uri: std.Uri) !Response {
|
||||
// Headers sent with the request. The list is released when this function
// returns (see the `defer` below).
// NOTE(review): this assumes `client.open` keeps its own copy of whatever it
// needs from `headers` -- confirm against the std.http version in use.
var headers = try std.http.Headers.initList(allocator, &[_]std.http.Field{
|
||||
.{ .name = "User-Agent", .value = user_agent },
|
||||
.{ .name = "Accept", .value = "*/*" },
|
||||
.{ .name = "Accept-Language", .value = "en-US,en;q=0.5" },
|
||||
});
|
||||
defer headers.deinit();
|
||||
|
||||
// The request object is created with `allocator`; the Response only holds a
// pointer to it.
var resp = Response{
|
||||
.allocator = allocator,
|
||||
.req = try allocator.create(std.http.Client.Request),
|
||||
};
|
||||
// If anything below fails, give the request allocation back.
errdefer allocator.destroy(resp.req);
|
||||
|
||||
// Fill the allocated slot with the request returned by the client.
resp.req.* = try self.client.open(.GET, uri, headers, .{
|
||||
.handle_redirects = true, // TODO handle redirects manually
|
||||
});
|
||||
// From here on the request is open: deinit it on any later error. errdefers
// run in reverse order, so this runs before the `destroy` registered above.
errdefer resp.req.deinit();
|
||||
|
||||
// NOTE(review): presumably `wait` returns once the response head has been
// received, leaving the body to be read by the caller -- confirm against
// the std.http.Client documentation.
try resp.req.send(.{});
|
||||
try resp.req.finish();
|
||||
try resp.req.wait();
|
||||
|
||||
// Success: none of the errdefers run; ownership passes to the caller.
return resp;
|
||||
}
|
||||
};
|
||||
|
||||
test "basic url fetch" {
|
||||
|
||||
Reference in New Issue
Block a user