mirror of
				https://github.com/lightpanda-io/browser.git
				synced 2025-10-30 15:41:48 +00:00 
			
		
		
		
	loader: parse html per chunk
This commit is contained in:
		| @@ -152,11 +152,13 @@ pub const Page = struct { | ||||
|  | ||||
|     // dump writes the page content into the given file. | ||||
|     pub fn dump(self: *Page, out: std.fs.File) !void { | ||||
|         // no data loaded, nothin to do. | ||||
|         if (self.raw_data == null) return; | ||||
|  | ||||
|         // if no HTML document pointer available, dump the data content only. | ||||
|         if (self.doc == null) return try out.writeAll(self.raw_data.?); | ||||
|         if (self.doc == null) { | ||||
|             // no data loaded, nothing to do. | ||||
|             if (self.raw_data == null) return; | ||||
|             return try out.writeAll(self.raw_data.?); | ||||
|         } | ||||
|  | ||||
|         // if the page has a pointer to a document, dumps the HTML. | ||||
|         const root = try parser.documentGetDocumentElement(self.doc.?) orelse return; | ||||
| @@ -175,22 +177,19 @@ pub const Page = struct { | ||||
|         // TODO handle fragment in url. | ||||
|  | ||||
|         // load the data | ||||
|         var result = try self.loader.fetch(self.allocator, self.uri); | ||||
|         defer result.deinit(); | ||||
|         var resp = try self.loader.get(self.allocator, self.uri); | ||||
|         defer resp.deinit(); | ||||
|  | ||||
|         log.info("GET {any} {d}", .{ self.uri, result.status }); | ||||
|         const req = resp.req; | ||||
|  | ||||
|         log.info("GET {any} {d}", .{ self.uri, req.response.status }); | ||||
|  | ||||
|         // TODO handle redirection | ||||
|         if (result.status != .ok) return error.BadStatusCode; | ||||
|  | ||||
|         if (result.body == null) return error.NoBody; | ||||
|  | ||||
|         // save the body into the page. | ||||
|         self.raw_data = try self.allocator.dupe(u8, result.body.?); | ||||
|         if (req.response.status != .ok) return error.BadStatusCode; | ||||
|  | ||||
|         // TODO handle charset | ||||
|         // https://html.spec.whatwg.org/#content-type | ||||
|         const ct = result.headers.getFirstValue("Content-Type") orelse { | ||||
|         const ct = req.response.headers.getFirstValue("Content-Type") orelse { | ||||
|             // no content type in HTTP headers. | ||||
|             // TODO try to sniff mime type from the body. | ||||
|             log.info("no content-type HTTP header", .{}); | ||||
| @@ -199,16 +198,19 @@ pub const Page = struct { | ||||
|         const mime = try Mime.parse(ct); | ||||
|         if (mime.eql(Mime.HTML)) { | ||||
|             // TODO check content-type | ||||
|             try self.loadHTMLDoc(&result); | ||||
|             try self.loadHTMLDoc(req.reader()); | ||||
|         } else { | ||||
|             log.info("non-HTML document: {s}", .{ct}); | ||||
|  | ||||
|             // save the body into the page. | ||||
|             self.raw_data = try req.reader().readAllAlloc(self.allocator, 16 * 1024 * 1024); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // https://html.spec.whatwg.org/#read-html | ||||
|     fn loadHTMLDoc(self: *Page, result: *FetchResult) !void { | ||||
|     fn loadHTMLDoc(self: *Page, reader: anytype) !void { | ||||
|         log.debug("parse html", .{}); | ||||
|         const html_doc = try parser.documentHTMLParseFromStr(result.body.?); | ||||
|         const html_doc = try parser.documentHTMLParse(reader); | ||||
|         const doc = parser.documentHTMLToDocument(html_doc); | ||||
|  | ||||
|         // save a document's pointer in the page. | ||||
|   | ||||
| @@ -6,10 +6,12 @@ pub const Loader = struct { | ||||
|     client: std.http.Client, | ||||
|  | ||||
|     pub const Response = struct { | ||||
|         req: std.http.Request, | ||||
|         allocator: std.mem.Allocator, | ||||
|         req: *std.http.Client.Request, | ||||
|  | ||||
|         pub fn deinit(self: *Response) void { | ||||
|             self.req.deinit(); | ||||
|             self.allocator.destroy(self.req); | ||||
|         } | ||||
|     }; | ||||
|  | ||||
| @@ -40,6 +42,36 @@ pub const Loader = struct { | ||||
|             .payload = .none, | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     // see | ||||
|     // https://ziglang.org/documentation/master/std/#A;std:http.Client.fetch | ||||
|     // for reference. | ||||
|     // The caller is responsible for calling `deinit()` on the `Response`. | ||||
|     pub fn get(self: *Loader, allocator: std.mem.Allocator, uri: std.Uri) !Response { | ||||
|         var headers = try std.http.Headers.initList(allocator, &[_]std.http.Field{ | ||||
|             .{ .name = "User-Agent", .value = user_agent }, | ||||
|             .{ .name = "Accept", .value = "*/*" }, | ||||
|             .{ .name = "Accept-Language", .value = "en-US,en;q=0.5" }, | ||||
|         }); | ||||
|         defer headers.deinit(); | ||||
|  | ||||
|         var resp = Response{ | ||||
|             .allocator = allocator, | ||||
|             .req = try allocator.create(std.http.Client.Request), | ||||
|         }; | ||||
|         errdefer allocator.destroy(resp.req); | ||||
|  | ||||
|         resp.req.* = try self.client.open(.GET, uri, headers, .{ | ||||
|             .handle_redirects = true, // TODO handle redirects manually | ||||
|         }); | ||||
|         errdefer resp.req.deinit(); | ||||
|  | ||||
|         try resp.req.send(.{}); | ||||
|         try resp.req.finish(); | ||||
|         try resp.req.wait(); | ||||
|  | ||||
|         return resp; | ||||
|     } | ||||
| }; | ||||
|  | ||||
| test "basic url fetch" { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Pierre Tachoire
					Pierre Tachoire