Ada 3.4.3
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_pattern-inl.h
Go to the documentation of this file.
1
5#ifndef ADA_URL_PATTERN_INL_H
6#define ADA_URL_PATTERN_INL_H
7
8#include "ada/common_defs.h"
10#include "ada/url_pattern.h"
11
12#include <algorithm>
13#include <string_view>
14#include <utility>
15
16#if ADA_INCLUDE_URL_PATTERN
17namespace ada {
18
19inline bool url_pattern_init::operator==(const url_pattern_init& other) const {
20 return protocol == other.protocol && username == other.username &&
21 password == other.password && hostname == other.hostname &&
22 port == other.port && search == other.search && hash == other.hash &&
23 pathname == other.pathname;
24}
25
26inline bool url_pattern_component_result::operator==(
27 const url_pattern_component_result& other) const {
28 return input == other.input && groups == other.groups;
29}
30
31template <url_pattern_regex::regex_concept regex_provider>
32url_pattern_component_result
33url_pattern_component<regex_provider>::create_component_match_result(
34 std::string&& input,
35 std::vector<std::optional<std::string>>&& exec_result) {
36 // Let result be a new URLPatternComponentResult.
37 // Set result["input"] to input.
38 // Let groups be a record<USVString, (USVString or undefined)>.
39 auto result =
40 url_pattern_component_result{.input = std::move(input), .groups = {}};
41
42 // We explicitly start iterating from 0 even though the spec
43 // says we should start from 1. This case is handled by the
44 // std_regex_provider which removes the full match from index 0.
45 // Use min() to guard against potential mismatches between
46 // exec_result size and group_name_list size.
47 const size_t size = std::min(exec_result.size(), group_name_list.size());
48 result.groups.reserve(size);
49 for (size_t index = 0; index < size; index++) {
50 result.groups.emplace(group_name_list[index],
51 std::move(exec_result[index]));
52 }
53 return result;
54}
55
56template <url_pattern_regex::regex_concept regex_provider>
57std::string_view url_pattern<regex_provider>::get_protocol() const
59 // Return this's associated URL pattern's protocol component's pattern string.
60 return protocol_component.pattern;
61}
62template <url_pattern_regex::regex_concept regex_provider>
63std::string_view url_pattern<regex_provider>::get_username() const
65 // Return this's associated URL pattern's username component's pattern string.
66 return username_component.pattern;
67}
68template <url_pattern_regex::regex_concept regex_provider>
69std::string_view url_pattern<regex_provider>::get_password() const
71 // Return this's associated URL pattern's password component's pattern string.
72 return password_component.pattern;
73}
74template <url_pattern_regex::regex_concept regex_provider>
75std::string_view url_pattern<regex_provider>::get_hostname() const
77 // Return this's associated URL pattern's hostname component's pattern string.
78 return hostname_component.pattern;
79}
80template <url_pattern_regex::regex_concept regex_provider>
81std::string_view url_pattern<regex_provider>::get_port() const
83 // Return this's associated URL pattern's port component's pattern string.
84 return port_component.pattern;
85}
86template <url_pattern_regex::regex_concept regex_provider>
87std::string_view url_pattern<regex_provider>::get_pathname() const
89 // Return this's associated URL pattern's pathname component's pattern string.
90 return pathname_component.pattern;
91}
92template <url_pattern_regex::regex_concept regex_provider>
93std::string_view url_pattern<regex_provider>::get_search() const
95 // Return this's associated URL pattern's search component's pattern string.
96 return search_component.pattern;
97}
98template <url_pattern_regex::regex_concept regex_provider>
99std::string_view url_pattern<regex_provider>::get_hash() const
101 // Return this's associated URL pattern's hash component's pattern string.
102 return hash_component.pattern;
103}
104template <url_pattern_regex::regex_concept regex_provider>
105bool url_pattern<regex_provider>::ignore_case() const {
106 return ignore_case_;
107}
108template <url_pattern_regex::regex_concept regex_provider>
109bool url_pattern<regex_provider>::has_regexp_groups() const {
110 // If this's associated URL pattern's has regexp groups, then return true.
111 return protocol_component.has_regexp_groups ||
112 username_component.has_regexp_groups ||
113 password_component.has_regexp_groups ||
114 hostname_component.has_regexp_groups ||
115 port_component.has_regexp_groups ||
116 pathname_component.has_regexp_groups ||
117 search_component.has_regexp_groups || hash_component.has_regexp_groups;
118}
119
120inline bool url_pattern_part::is_regexp() const noexcept {
121 return type == url_pattern_part_type::REGEXP;
122}
123
124inline std::string_view url_pattern_compile_component_options::get_delimiter()
125 const {
126 if (delimiter) {
127 return {&delimiter.value(), 1};
128 }
129 return {};
130}
131
132inline std::string_view url_pattern_compile_component_options::get_prefix()
133 const {
134 if (prefix) {
135 return {&prefix.value(), 1};
136 }
137 return {};
138}
139
140template <url_pattern_regex::regex_concept regex_provider>
141template <url_pattern_encoding_callback F>
142tl::expected<url_pattern_component<regex_provider>, errors>
143url_pattern_component<regex_provider>::compile(
144 std::string_view input, F& encoding_callback,
145 url_pattern_compile_component_options& options) {
146 ada_log("url_pattern_component::compile input: ", input);
147 // Let part list be the result of running parse a pattern string given input,
148 // options, and encoding callback.
149 auto part_list = url_pattern_helpers::parse_pattern_string(input, options,
150 encoding_callback);
151
152 if (!part_list) {
153 ada_log("parse_pattern_string failed");
154 return tl::unexpected(part_list.error());
155 }
156
157 // Detect pattern type early to potentially skip expensive regex compilation
158 const auto has_regexp = [](const auto& part) { return part.is_regexp(); };
159 const bool has_regexp_groups = std::ranges::any_of(*part_list, has_regexp);
160
161 url_pattern_component_type component_type =
162 url_pattern_component_type::REGEXP;
163 std::string exact_match_value{};
164
165 if (part_list->empty()) {
166 component_type = url_pattern_component_type::EMPTY;
167 } else if (part_list->size() == 1) {
168 const auto& part = (*part_list)[0];
169 if (part.type == url_pattern_part_type::FIXED_TEXT &&
170 part.modifier == url_pattern_part_modifier::none &&
171 !options.ignore_case) {
172 component_type = url_pattern_component_type::EXACT_MATCH;
173 exact_match_value = part.value;
174 } else if (part.type == url_pattern_part_type::FULL_WILDCARD &&
175 part.modifier == url_pattern_part_modifier::none &&
176 part.prefix.empty() && part.suffix.empty()) {
177 component_type = url_pattern_component_type::FULL_WILDCARD;
178 }
179 }
180
181 // For simple patterns, skip regex generation and compilation entirely
182 if (component_type != url_pattern_component_type::REGEXP) {
183 auto pattern_string =
184 url_pattern_helpers::generate_pattern_string(*part_list, options);
185 // For FULL_WILDCARD, we need the group name from
186 // generate_regular_expression
187 std::vector<std::string> name_list;
188 if (component_type == url_pattern_component_type::FULL_WILDCARD &&
189 !part_list->empty()) {
190 name_list.push_back((*part_list)[0].name);
191 }
192 return url_pattern_component<regex_provider>(
193 std::move(pattern_string), typename regex_provider::regex_type{},
194 std::move(name_list), has_regexp_groups, component_type,
195 std::move(exact_match_value));
196 }
197
198 // Generate regex for complex patterns
199 auto [regular_expression_string, name_list] =
200 url_pattern_helpers::generate_regular_expression_and_name_list(*part_list,
201 options);
202 auto pattern_string =
203 url_pattern_helpers::generate_pattern_string(*part_list, options);
204
205 std::optional<typename regex_provider::regex_type> regular_expression =
206 regex_provider::create_instance(regular_expression_string,
207 options.ignore_case);
208 if (!regular_expression) {
209 return tl::unexpected(errors::type_error);
210 }
211
212 return url_pattern_component<regex_provider>(
213 std::move(pattern_string), std::move(*regular_expression),
214 std::move(name_list), has_regexp_groups, component_type,
215 std::move(exact_match_value));
216}
217
218template <url_pattern_regex::regex_concept regex_provider>
219bool url_pattern_component<regex_provider>::fast_test(
220 std::string_view input) const noexcept {
221 // Fast path for simple patterns - avoid regex evaluation
222 // Using if-else for better branch prediction on common cases
223 if (type == url_pattern_component_type::FULL_WILDCARD) {
224 return true;
225 }
226 if (type == url_pattern_component_type::EXACT_MATCH) {
227 return input == exact_match_value;
228 }
229 if (type == url_pattern_component_type::EMPTY) {
230 return input.empty();
231 }
232 // type == REGEXP
233 return regex_provider::regex_match(input, regexp);
234}
235
236template <url_pattern_regex::regex_concept regex_provider>
237std::optional<std::vector<std::optional<std::string>>>
238url_pattern_component<regex_provider>::fast_match(
239 std::string_view input) const {
240 // Handle each type directly without redundant checks
241 if (type == url_pattern_component_type::FULL_WILDCARD) {
242 // FULL_WILDCARD always matches - capture the input (even if empty)
243 // If there's no group name, return empty groups
244 if (group_name_list.empty()) {
245 return std::vector<std::optional<std::string>>{};
246 }
247 // Capture the matched input (including empty strings)
248 return std::vector<std::optional<std::string>>{std::string(input)};
249 }
250 if (type == url_pattern_component_type::EXACT_MATCH) {
251 if (input == exact_match_value) {
252 return std::vector<std::optional<std::string>>{};
253 }
254 return std::nullopt;
255 }
256 if (type == url_pattern_component_type::EMPTY) {
257 if (input.empty()) {
258 return std::vector<std::optional<std::string>>{};
259 }
260 return std::nullopt;
261 }
262 // type == REGEXP - use regex
263 return regex_provider::regex_search(input, regexp);
264}
265
266template <url_pattern_regex::regex_concept regex_provider>
267result<std::optional<url_pattern_result>> url_pattern<regex_provider>::exec(
268 const url_pattern_input& input, const std::string_view* base_url) {
269 // Return the result of match given this's associated URL pattern, input, and
270 // baseURL if given.
271 return match(input, base_url);
272}
273
274template <url_pattern_regex::regex_concept regex_provider>
275bool url_pattern<regex_provider>::test_components(
276 std::string_view protocol, std::string_view username,
277 std::string_view password, std::string_view hostname, std::string_view port,
278 std::string_view pathname, std::string_view search,
279 std::string_view hash) const {
280 return protocol_component.fast_test(protocol) &&
281 username_component.fast_test(username) &&
282 password_component.fast_test(password) &&
283 hostname_component.fast_test(hostname) &&
284 port_component.fast_test(port) &&
285 pathname_component.fast_test(pathname) &&
286 search_component.fast_test(search) && hash_component.fast_test(hash);
287}
288
289template <url_pattern_regex::regex_concept regex_provider>
290result<bool> url_pattern<regex_provider>::test(
291 const url_pattern_input& input, const std::string_view* base_url_string) {
292 // If input is a URLPatternInit
293 if (std::holds_alternative<url_pattern_init>(input)) {
294 if (base_url_string) {
295 return tl::unexpected(errors::type_error);
296 }
297
298 std::string protocol{}, username{}, password{}, hostname{};
299 std::string port{}, pathname{}, search{}, hash{};
300
301 auto apply_result = url_pattern_init::process(
302 std::get<url_pattern_init>(input), url_pattern_init::process_type::url,
303 protocol, username, password, hostname, port, pathname, search, hash);
304
305 if (!apply_result) {
306 return false;
307 }
308
309 std::string_view search_view = *apply_result->search;
310 if (search_view.starts_with("?")) {
311 search_view.remove_prefix(1);
312 }
313
314 return test_components(*apply_result->protocol, *apply_result->username,
315 *apply_result->password, *apply_result->hostname,
316 *apply_result->port, *apply_result->pathname,
317 search_view, *apply_result->hash);
318 }
319
320 // URL string input path
321 result<url_aggregator> base_url;
322 if (base_url_string) {
323 base_url = ada::parse<url_aggregator>(*base_url_string, nullptr);
324 if (!base_url) {
325 return false;
326 }
327 }
328
329 auto url =
330 ada::parse<url_aggregator>(std::get<std::string_view>(input),
331 base_url.has_value() ? &*base_url : nullptr);
332 if (!url) {
333 return false;
334 }
335
336 // Extract components as string_view
337 auto protocol_view = url->get_protocol();
338 if (protocol_view.ends_with(":")) {
339 protocol_view.remove_suffix(1);
340 }
341
342 auto search_view = url->get_search();
343 if (search_view.starts_with("?")) {
344 search_view.remove_prefix(1);
345 }
346
347 auto hash_view = url->get_hash();
348 if (hash_view.starts_with("#")) {
349 hash_view.remove_prefix(1);
350 }
351
352 return test_components(protocol_view, url->get_username(),
354 url->get_port(), url->get_pathname(), search_view,
355 hash_view);
356}
357
358template <url_pattern_regex::regex_concept regex_provider>
359result<std::optional<url_pattern_result>> url_pattern<regex_provider>::match(
360 const url_pattern_input& input, const std::string_view* base_url_string) {
361 std::string protocol{};
362 std::string username{};
363 std::string password{};
364 std::string hostname{};
365 std::string port{};
366 std::string pathname{};
367 std::string search{};
368 std::string hash{};
369
370 // Let inputs be an empty list.
371 // Append input to inputs.
372 std::vector inputs{input};
373
374 // If input is a URLPatternInit then:
375 if (std::holds_alternative<url_pattern_init>(input)) {
376 ada_log(
377 "url_pattern::match called with url_pattern_init and base_url_string=",
378 base_url_string);
379 // If baseURLString was given, throw a TypeError.
380 if (base_url_string) {
381 ada_log("failed to match because base_url_string was given");
382 return tl::unexpected(errors::type_error);
383 }
384
385 // Let applyResult be the result of process a URLPatternInit given input,
386 // "url", protocol, username, password, hostname, port, pathname, search,
387 // and hash.
388 auto apply_result = url_pattern_init::process(
389 std::get<url_pattern_init>(input), url_pattern_init::process_type::url,
390 protocol, username, password, hostname, port, pathname, search, hash);
391
392 // If this throws an exception, catch it, and return null.
393 if (!apply_result.has_value()) {
394 ada_log("match returned std::nullopt because process threw");
395 return std::nullopt;
396 }
397
398 // Set protocol to applyResult["protocol"].
399 ADA_ASSERT_TRUE(apply_result->protocol.has_value());
400 protocol = std::move(apply_result->protocol.value());
401
402 // Set username to applyResult["username"].
403 ADA_ASSERT_TRUE(apply_result->username.has_value());
404 username = std::move(apply_result->username.value());
405
406 // Set password to applyResult["password"].
407 ADA_ASSERT_TRUE(apply_result->password.has_value());
408 password = std::move(apply_result->password.value());
409
410 // Set hostname to applyResult["hostname"].
411 ADA_ASSERT_TRUE(apply_result->hostname.has_value());
412 hostname = std::move(apply_result->hostname.value());
413
414 // Set port to applyResult["port"].
415 ADA_ASSERT_TRUE(apply_result->port.has_value());
416 port = std::move(apply_result->port.value());
417
418 // Set pathname to applyResult["pathname"].
419 ADA_ASSERT_TRUE(apply_result->pathname.has_value());
420 pathname = std::move(apply_result->pathname.value());
421
422 // Set search to applyResult["search"].
423 ADA_ASSERT_TRUE(apply_result->search.has_value());
424 if (apply_result->search->starts_with("?")) {
425 search = apply_result->search->substr(1);
426 } else {
427 search = std::move(apply_result->search.value());
428 }
429
430 // Set hash to applyResult["hash"].
431 ADA_ASSERT_TRUE(apply_result->hash.has_value());
432 ADA_ASSERT_TRUE(!apply_result->hash->starts_with("#"));
433 hash = std::move(apply_result->hash.value());
434 } else {
435 ADA_ASSERT_TRUE(std::holds_alternative<std::string_view>(input));
436
437 // Let baseURL be null.
438 result<url_aggregator> base_url;
439
440 // If baseURLString was given, then:
441 if (base_url_string) {
442 // Let baseURL be the result of parsing baseURLString.
443 base_url = ada::parse<url_aggregator>(*base_url_string, nullptr);
444
445 // If baseURL is failure, return null.
446 if (!base_url) {
447 ada_log("match returned std::nullopt because failed to parse base_url=",
448 *base_url_string);
449 return std::nullopt;
450 }
451
452 // Append baseURLString to inputs.
453 inputs.emplace_back(*base_url_string);
454 }
455
456 url_aggregator* base_url_value =
457 base_url.has_value() ? &*base_url : nullptr;
458
459 // Set url to the result of parsing input given baseURL.
460 auto url = ada::parse<url_aggregator>(std::get<std::string_view>(input),
461 base_url_value);
462
463 // If url is failure, return null.
464 if (!url) {
465 ada_log("match returned std::nullopt because url failed");
466 return std::nullopt;
467 }
468
469 // Set protocol to url's scheme.
470 // IMPORTANT: Not documented on the URLPattern spec, but protocol suffix ':'
471 // is removed. Similar work was done on workerd:
472 // https://github.com/cloudflare/workerd/blob/8620d14012513a6ce04d079e401d3becac3c67bd/src/workerd/jsg/url.c%2B%2B#L2038
473 protocol = url->get_protocol().substr(0, url->get_protocol().size() - 1);
474 // Set username to url's username.
475 username = url->get_username();
476 // Set password to url's password.
477 password = url->get_password();
478 // Set hostname to url's host, serialized, or the empty string if the value
479 // is null.
480 hostname = url->get_hostname();
481 // Set port to url's port, serialized, or the empty string if the value is
482 // null.
483 port = url->get_port();
484 // Set pathname to the result of URL path serializing url.
485 pathname = url->get_pathname();
486 // Set search to url's query or the empty string if the value is null.
487 // IMPORTANT: Not documented on the URLPattern spec, but search prefix '?'
488 // is removed. Similar work was done on workerd:
489 // https://github.com/cloudflare/workerd/blob/8620d14012513a6ce04d079e401d3becac3c67bd/src/workerd/jsg/url.c%2B%2B#L2232
490 if (url->has_search()) {
491 auto view = url->get_search();
492 search = view.starts_with("?") ? url->get_search().substr(1) : view;
493 }
494 // Set hash to url's fragment or the empty string if the value is null.
495 // IMPORTANT: Not documented on the URLPattern spec, but hash prefix '#' is
496 // removed. Similar work was done on workerd:
497 // https://github.com/cloudflare/workerd/blob/8620d14012513a6ce04d079e401d3becac3c67bd/src/workerd/jsg/url.c%2B%2B#L2242
498 if (url->has_hash()) {
499 auto view = url->get_hash();
500 hash = view.starts_with("#") ? url->get_hash().substr(1) : view;
501 }
502 }
503
504 // Use fast_match which skips regex for simple patterns (EMPTY, EXACT_MATCH,
505 // FULL_WILDCARD) and only falls back to regex for complex REGEXP patterns.
506
507 // Let protocolExecResult be RegExpBuiltinExec(urlPattern's protocol
508 // component's regular expression, protocol).
509 auto protocol_exec_result = protocol_component.fast_match(protocol);
510 if (!protocol_exec_result) {
511 return std::nullopt;
512 }
513
514 // Let usernameExecResult be RegExpBuiltinExec(urlPattern's username
515 // component's regular expression, username).
516 auto username_exec_result = username_component.fast_match(username);
517 if (!username_exec_result) {
518 return std::nullopt;
519 }
520
521 // Let passwordExecResult be RegExpBuiltinExec(urlPattern's password
522 // component's regular expression, password).
523 auto password_exec_result = password_component.fast_match(password);
524 if (!password_exec_result) {
525 return std::nullopt;
526 }
527
528 // Let hostnameExecResult be RegExpBuiltinExec(urlPattern's hostname
529 // component's regular expression, hostname).
530 auto hostname_exec_result = hostname_component.fast_match(hostname);
531 if (!hostname_exec_result) {
532 return std::nullopt;
533 }
534
535 // Let portExecResult be RegExpBuiltinExec(urlPattern's port component's
536 // regular expression, port).
537 auto port_exec_result = port_component.fast_match(port);
538 if (!port_exec_result) {
539 return std::nullopt;
540 }
541
542 // Let pathnameExecResult be RegExpBuiltinExec(urlPattern's pathname
543 // component's regular expression, pathname).
544 auto pathname_exec_result = pathname_component.fast_match(pathname);
545 if (!pathname_exec_result) {
546 return std::nullopt;
547 }
548
549 // Let searchExecResult be RegExpBuiltinExec(urlPattern's search component's
550 // regular expression, search).
551 auto search_exec_result = search_component.fast_match(search);
552 if (!search_exec_result) {
553 return std::nullopt;
554 }
555
556 // Let hashExecResult be RegExpBuiltinExec(urlPattern's hash component's
557 // regular expression, hash).
558 auto hash_exec_result = hash_component.fast_match(hash);
559 if (!hash_exec_result) {
560 return std::nullopt;
561 }
562
563 // Let result be a new URLPatternResult.
564 auto result = url_pattern_result{};
565 // Set result["inputs"] to inputs.
566 result.inputs = std::move(inputs);
567 // Set result["protocol"] to the result of creating a component match result
568 // given urlPattern's protocol component, protocol, and protocolExecResult.
569 result.protocol = protocol_component.create_component_match_result(
570 std::move(protocol), std::move(*protocol_exec_result));
571
572 // Set result["username"] to the result of creating a component match result
573 // given urlPattern's username component, username, and usernameExecResult.
574 result.username = username_component.create_component_match_result(
575 std::move(username), std::move(*username_exec_result));
576
577 // Set result["password"] to the result of creating a component match result
578 // given urlPattern's password component, password, and passwordExecResult.
579 result.password = password_component.create_component_match_result(
580 std::move(password), std::move(*password_exec_result));
581
582 // Set result["hostname"] to the result of creating a component match result
583 // given urlPattern's hostname component, hostname, and hostnameExecResult.
584 result.hostname = hostname_component.create_component_match_result(
585 std::move(hostname), std::move(*hostname_exec_result));
586
587 // Set result["port"] to the result of creating a component match result given
588 // urlPattern's port component, port, and portExecResult.
589 result.port = port_component.create_component_match_result(
590 std::move(port), std::move(*port_exec_result));
591
592 // Set result["pathname"] to the result of creating a component match result
593 // given urlPattern's pathname component, pathname, and pathnameExecResult.
594 result.pathname = pathname_component.create_component_match_result(
595 std::move(pathname), std::move(*pathname_exec_result));
596
597 // Set result["search"] to the result of creating a component match result
598 // given urlPattern's search component, search, and searchExecResult.
599 result.search = search_component.create_component_match_result(
600 std::move(search), std::move(*search_exec_result));
601
602 // Set result["hash"] to the result of creating a component match result given
603 // urlPattern's hash component, hash, and hashExecResult.
604 result.hash = hash_component.create_component_match_result(
605 std::move(hash), std::move(*hash_exec_result));
606
607 return result;
608}
609
610} // namespace ada
611#endif // ADA_INCLUDE_URL_PATTERN
612#endif
Cross-platform compiler macros and common definitions.
#define ADA_ASSERT_TRUE(COND)
#define ada_lifetime_bound
type
Enumeration of URL scheme types.
Definition scheme.h:41
Definition ada_idna.h:13
errors
Error codes for URL parsing operations.
Definition errors.h:17
@ type_error
Definition errors.h:18
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
Memory-efficient URL representation using a single buffer.
Represents a parsed URL with individual string components.
Definition url.h:62
std::string get_search() const
Definition url.cpp:659
std::string get_port() const
Definition url.cpp:674
std::string get_protocol() const
Definition url.cpp:633
constexpr std::string_view get_pathname() const noexcept
Definition url-inl.h:46
std::string get_hostname() const
Definition url.cpp:655
const std::string & get_password() const noexcept
Definition url.cpp:670
const std::string & get_username() const noexcept
Definition url.cpp:666
constexpr bool has_search() const noexcept override
Definition url-inl.h:164
std::string get_hash() const
Definition url.cpp:678
constexpr bool has_hash() const noexcept override
Definition url-inl.h:160
URLPattern API implementation.
Declaration for the URLPattern helpers.