Mirror of https://github.com/lightpanda-io/browser.git, synced 2025-10-30 15:41:48 +00:00
			
		
		
		
	loader: parse html per chunk
This commit is contained in:
		| @@ -152,11 +152,13 @@ pub const Page = struct { | |||||||
|  |  | ||||||
|     // dump writes the page content into the given file. |     // dump writes the page content into the given file. | ||||||
|     pub fn dump(self: *Page, out: std.fs.File) !void { |     pub fn dump(self: *Page, out: std.fs.File) !void { | ||||||
|         // no data loaded, nothin to do. |  | ||||||
|         if (self.raw_data == null) return; |  | ||||||
|  |  | ||||||
|         // if no HTML document pointer available, dump the data content only. |         // if no HTML document pointer available, dump the data content only. | ||||||
|         if (self.doc == null) return try out.writeAll(self.raw_data.?); |         if (self.doc == null) { | ||||||
|  |             // no data loaded, nothing to do. | ||||||
|  |             if (self.raw_data == null) return; | ||||||
|  |             return try out.writeAll(self.raw_data.?); | ||||||
|  |         } | ||||||
|  |  | ||||||
|         // if the page has a pointer to a document, dump the HTML. |         // if the page has a pointer to a document, dump the HTML. | ||||||
|         const root = try parser.documentGetDocumentElement(self.doc.?) orelse return; |         const root = try parser.documentGetDocumentElement(self.doc.?) orelse return; | ||||||
| @@ -175,22 +177,19 @@ pub const Page = struct { | |||||||
|         // TODO handle fragment in url. |         // TODO handle fragment in url. | ||||||
|  |  | ||||||
|         // load the data |         // load the data | ||||||
|         var result = try self.loader.fetch(self.allocator, self.uri); |         var resp = try self.loader.get(self.allocator, self.uri); | ||||||
|         defer result.deinit(); |         defer resp.deinit(); | ||||||
|  |  | ||||||
|         log.info("GET {any} {d}", .{ self.uri, result.status }); |         const req = resp.req; | ||||||
|  |  | ||||||
|  |         log.info("GET {any} {d}", .{ self.uri, req.response.status }); | ||||||
|  |  | ||||||
|         // TODO handle redirection |         // TODO handle redirection | ||||||
|         if (result.status != .ok) return error.BadStatusCode; |         if (req.response.status != .ok) return error.BadStatusCode; | ||||||
|  |  | ||||||
|         if (result.body == null) return error.NoBody; |  | ||||||
|  |  | ||||||
|         // save the body into the page. |  | ||||||
|         self.raw_data = try self.allocator.dupe(u8, result.body.?); |  | ||||||
|  |  | ||||||
|         // TODO handle charset |         // TODO handle charset | ||||||
|         // https://html.spec.whatwg.org/#content-type |         // https://html.spec.whatwg.org/#content-type | ||||||
|         const ct = result.headers.getFirstValue("Content-Type") orelse { |         const ct = req.response.headers.getFirstValue("Content-Type") orelse { | ||||||
|             // no content type in HTTP headers. |             // no content type in HTTP headers. | ||||||
|             // TODO try to sniff mime type from the body. |             // TODO try to sniff mime type from the body. | ||||||
|             log.info("no content-type HTTP header", .{}); |             log.info("no content-type HTTP header", .{}); | ||||||
| @@ -199,16 +198,19 @@ pub const Page = struct { | |||||||
|         const mime = try Mime.parse(ct); |         const mime = try Mime.parse(ct); | ||||||
|         if (mime.eql(Mime.HTML)) { |         if (mime.eql(Mime.HTML)) { | ||||||
|             // TODO check content-type |             // TODO check content-type | ||||||
|             try self.loadHTMLDoc(&result); |             try self.loadHTMLDoc(req.reader()); | ||||||
|         } else { |         } else { | ||||||
|             log.info("non-HTML document: {s}", .{ct}); |             log.info("non-HTML document: {s}", .{ct}); | ||||||
|  |  | ||||||
|  |             // save the body into the page. | ||||||
|  |             self.raw_data = try req.reader().readAllAlloc(self.allocator, 16 * 1024 * 1024); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // https://html.spec.whatwg.org/#read-html |     // https://html.spec.whatwg.org/#read-html | ||||||
|     fn loadHTMLDoc(self: *Page, result: *FetchResult) !void { |     fn loadHTMLDoc(self: *Page, reader: anytype) !void { | ||||||
|         log.debug("parse html", .{}); |         log.debug("parse html", .{}); | ||||||
|         const html_doc = try parser.documentHTMLParseFromStr(result.body.?); |         const html_doc = try parser.documentHTMLParse(reader); | ||||||
|         const doc = parser.documentHTMLToDocument(html_doc); |         const doc = parser.documentHTMLToDocument(html_doc); | ||||||
|  |  | ||||||
|         // save a document's pointer in the page. |         // save a document's pointer in the page. | ||||||
|   | |||||||
| @@ -6,10 +6,12 @@ pub const Loader = struct { | |||||||
|     client: std.http.Client, |     client: std.http.Client, | ||||||
|  |  | ||||||
|     pub const Response = struct { |     pub const Response = struct { | ||||||
|         req: std.http.Request, |         allocator: std.mem.Allocator, | ||||||
|  |         req: *std.http.Client.Request, | ||||||
|  |  | ||||||
|         pub fn deinit(self: *Response) void { |         pub fn deinit(self: *Response) void { | ||||||
|             self.req.deinit(); |             self.req.deinit(); | ||||||
|  |             self.allocator.destroy(self.req); | ||||||
|         } |         } | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
| @@ -40,6 +42,36 @@ pub const Loader = struct { | |||||||
|             .payload = .none, |             .payload = .none, | ||||||
|         }); |         }); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     // see | ||||||
|  |     // https://ziglang.org/documentation/master/std/#A;std:http.Client.fetch | ||||||
|  |     // for reference. | ||||||
|  |     // The caller is responsible for calling `deinit()` on the `Response`. | ||||||
|  |     pub fn get(self: *Loader, allocator: std.mem.Allocator, uri: std.Uri) !Response { | ||||||
|  |         var headers = try std.http.Headers.initList(allocator, &[_]std.http.Field{ | ||||||
|  |             .{ .name = "User-Agent", .value = user_agent }, | ||||||
|  |             .{ .name = "Accept", .value = "*/*" }, | ||||||
|  |             .{ .name = "Accept-Language", .value = "en-US,en;q=0.5" }, | ||||||
|  |         }); | ||||||
|  |         defer headers.deinit(); | ||||||
|  |  | ||||||
|  |         var resp = Response{ | ||||||
|  |             .allocator = allocator, | ||||||
|  |             .req = try allocator.create(std.http.Client.Request), | ||||||
|  |         }; | ||||||
|  |         errdefer allocator.destroy(resp.req); | ||||||
|  |  | ||||||
|  |         resp.req.* = try self.client.open(.GET, uri, headers, .{ | ||||||
|  |             .handle_redirects = true, // TODO handle redirects manually | ||||||
|  |         }); | ||||||
|  |         errdefer resp.req.deinit(); | ||||||
|  |  | ||||||
|  |         try resp.req.send(.{}); | ||||||
|  |         try resp.req.finish(); | ||||||
|  |         try resp.req.wait(); | ||||||
|  |  | ||||||
|  |         return resp; | ||||||
|  |     } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| test "basic url fetch" { | test "basic url fetch" { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Pierre Tachoire
					Pierre Tachoire