Skip to content

Commit c2f4494

Browse files
authored
feat: fast matcher for common patterns (#32)
This commit adds a fast path matcher for common patterns. This fast path uses literal string comparisons and other simple heuristics when possible, but falls back to a full regex match if the simple match patterns do not suffice (for example when backtracking is necessary).
1 parent 725f1f7 commit c2f4494

File tree

6 files changed

+284
-89
lines changed

6 files changed

+284
-89
lines changed

.github/workflows/ci.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ jobs:
1515
- name: Checkout repo
1616
uses: actions/checkout@v2
1717

18-
- name: Install rust
19-
uses: hecrj/setup-rust-action@v1.3.4
18+
- name: Install Rust
19+
uses: dtolnay/rust-toolchain@stable
2020
with:
21-
rust-version: 1.55.0
21+
rust-version: 1.60.0
2222
components: clippy,rustfmt
2323

2424
- name: Format

src/component.rs

+99-6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
// Copyright 2018-2021 the Deno authors. All rights reserved. MIT license.
22

3+
use crate::matcher::InnerMatcher;
4+
use crate::matcher::Matcher;
35
use crate::parser::Options;
46
use crate::parser::Part;
57
use crate::parser::PartModifier;
@@ -15,6 +17,7 @@ pub(crate) struct Component<R: RegExp> {
1517
pub pattern_string: String,
1618
pub regexp: Result<R, Error>,
1719
pub group_name_list: Vec<String>,
20+
pub matcher: Matcher<R>,
1821
}
1922

2023
impl<R: RegExp> Component<R> {
@@ -32,14 +35,17 @@ impl<R: RegExp> Component<R> {
3235
&options,
3336
encoding_callback,
3437
)?;
38+
let part_list = part_list.iter().collect::<Vec<_>>();
3539
let (regexp_string, name_list) =
3640
generate_regular_expression_and_name_list(&part_list, &options);
3741
let regexp = R::parse(&regexp_string).map_err(Error::RegExp);
38-
let pattern_string = generate_pattern_string(part_list, &options);
42+
let pattern_string = generate_pattern_string(&part_list, &options);
43+
let matcher = generate_matcher::<R>(&part_list, &options);
3944
Ok(Component {
4045
pattern_string,
4146
regexp,
4247
group_name_list: name_list,
48+
matcher,
4349
})
4450
}
4551

@@ -85,7 +91,7 @@ impl<R: RegExp> Component<R> {
8591

8692
// Ref: https://wicg.github.io/urlpattern/#generate-a-regular-expression-and-name-list
8793
fn generate_regular_expression_and_name_list(
88-
part_list: &[Part],
94+
part_list: &[&Part],
8995
options: &Options,
9096
) -> (String, Vec<String>) {
9197
let mut result = String::from("^");
@@ -153,12 +159,15 @@ fn generate_regular_expression_and_name_list(
153159
}
154160

155161
// Ref: https://wicg.github.io/urlpattern/#generate-a-pattern-string
156-
fn generate_pattern_string(part_list: Vec<Part>, options: &Options) -> String {
162+
fn generate_pattern_string(part_list: &[&Part], options: &Options) -> String {
157163
let mut result = String::new();
158164
for (i, part) in part_list.iter().enumerate() {
159-
let prev_part: Option<&Part> =
160-
if i == 0 { None } else { part_list.get(i - 1) };
161-
let next_part: Option<&Part> = part_list.get(i + 1);
165+
let prev_part: Option<&Part> = if i == 0 {
166+
None
167+
} else {
168+
part_list.get(i - 1).copied()
169+
};
170+
let next_part: Option<&Part> = part_list.get(i + 1).copied();
162171
if part.kind == PartType::FixedText {
163172
if part.modifier == PartModifier::None {
164173
result.push_str(&escape_pattern_string(&part.value));
@@ -261,3 +270,87 @@ fn escape_pattern_string(input: &str) -> String {
261270
}
262271
result
263272
}
273+
274+
/// This function generates a matcher for a given parts list.
275+
fn generate_matcher<R: RegExp>(
276+
mut part_list: &[&Part],
277+
options: &Options,
278+
) -> Matcher<R> {
279+
fn is_literal(part: &Part) -> bool {
280+
part.kind == PartType::FixedText && part.modifier == PartModifier::None
281+
}
282+
283+
// If the first part is a fixed string, we can use it as a literal prefix.
284+
let mut prefix = match part_list.first() {
285+
Some(part) if is_literal(part) => {
286+
part_list = &part_list[1..];
287+
part.value.clone()
288+
}
289+
_ => "".into(),
290+
};
291+
// If the last part is a fixed string, we can use it as a literal suffix.
292+
let mut suffix = match part_list.last() {
293+
Some(part) if is_literal(part) => {
294+
part_list = &part_list[..part_list.len() - 1];
295+
part.value.clone()
296+
}
297+
_ => "".into(),
298+
};
299+
300+
// If there are no more parts, we must have a prefix and/or a suffix. We can
301+
// combine these into a single fixed text literal matcher.
302+
if part_list.is_empty() {
303+
return Matcher::literal(format!("{prefix}{suffix}"));
304+
}
305+
306+
let inner = match part_list {
307+
// If there is only one part, and it is a simple full wildcard with no
308+
// prefix or suffix, we can use a simple wildcard matcher.
309+
[part]
310+
if part.kind == PartType::FullWildcard
311+
&& part.modifier == PartModifier::None =>
312+
{
313+
prefix += &part.prefix;
314+
if !part.suffix.is_empty() {
315+
suffix = format!("{}{suffix}", part.suffix);
316+
}
317+
InnerMatcher::SingleCapture {
318+
filter: None,
319+
allow_empty: true,
320+
}
321+
}
322+
// If there is only one part, and it is a simple segment wildcard with no
323+
// prefix or suffix, we can use a simple wildcard matcher.
324+
[part]
325+
if part.kind == PartType::SegmentWildcard
326+
&& part.modifier == PartModifier::None =>
327+
{
328+
prefix += &part.prefix;
329+
if !part.suffix.is_empty() {
330+
suffix = format!("{}{suffix}", part.suffix);
331+
}
332+
let filter = if options.delimiter_code_point.is_empty() {
333+
None
334+
} else {
335+
Some(options.delimiter_code_point.clone())
336+
};
337+
InnerMatcher::SingleCapture {
338+
filter,
339+
allow_empty: false,
340+
}
341+
}
342+
// For all other cases, we fall back to a regexp matcher.
343+
part_list => {
344+
let (regexp_string, _) =
345+
generate_regular_expression_and_name_list(part_list, options);
346+
let regexp = R::parse(&regexp_string).map_err(Error::RegExp);
347+
InnerMatcher::RegExp { regexp }
348+
}
349+
};
350+
351+
Matcher {
352+
prefix,
353+
suffix,
354+
inner,
355+
}
356+
}

src/lib.rs

+9-36
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ mod canonicalize_and_process;
99
mod component;
1010
mod constructor_parser;
1111
mod error;
12+
mod matcher;
1213
mod parser;
1314
pub mod quirks;
1415
mod regexp;
@@ -405,42 +406,14 @@ impl<R: RegExp> UrlPattern<R> {
405406
None => return Ok(None),
406407
};
407408

408-
let protocol_exec_result = self
409-
.protocol
410-
.regexp
411-
.as_ref()
412-
.unwrap()
413-
.matches(&input.protocol);
414-
let username_exec_result = self
415-
.username
416-
.regexp
417-
.as_ref()
418-
.unwrap()
419-
.matches(&input.username);
420-
let password_exec_result = self
421-
.password
422-
.regexp
423-
.as_ref()
424-
.unwrap()
425-
.matches(&input.password);
426-
let hostname_exec_result = self
427-
.hostname
428-
.regexp
429-
.as_ref()
430-
.unwrap()
431-
.matches(&input.hostname);
432-
let port_exec_result =
433-
self.port.regexp.as_ref().unwrap().matches(&input.port);
434-
let pathname_exec_result = self
435-
.pathname
436-
.regexp
437-
.as_ref()
438-
.unwrap()
439-
.matches(&input.pathname);
440-
let search_exec_result =
441-
self.search.regexp.as_ref().unwrap().matches(&input.search);
442-
let hash_exec_result =
443-
self.hash.regexp.as_ref().unwrap().matches(&input.hash);
409+
let protocol_exec_result = self.protocol.matcher.matches(&input.protocol);
410+
let username_exec_result = self.username.matcher.matches(&input.username);
411+
let password_exec_result = self.password.matcher.matches(&input.password);
412+
let hostname_exec_result = self.hostname.matcher.matches(&input.hostname);
413+
let port_exec_result = self.port.matcher.matches(&input.port);
414+
let pathname_exec_result = self.pathname.matcher.matches(&input.pathname);
415+
let search_exec_result = self.search.matcher.matches(&input.search);
416+
let hash_exec_result = self.hash.matcher.matches(&input.hash);
444417

445418
match (
446419
protocol_exec_result,

src/matcher.rs

+92
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
use crate::regexp::RegExp;
2+
use crate::Error;
3+
4+
#[derive(Debug)]
5+
/// A structured representation of a URLPattern matcher, which can be used to
6+
/// match a URL against a pattern quickly.
7+
pub(crate) struct Matcher<R: RegExp> {
8+
pub prefix: String,
9+
pub suffix: String,
10+
pub inner: InnerMatcher<R>,
11+
}
12+
13+
#[derive(Debug)]
14+
pub(crate) enum InnerMatcher<R: RegExp> {
15+
/// A literal string matcher.
16+
///
17+
/// # Examples
18+
/// - /
19+
/// - /foo
20+
Literal { literal: String },
21+
/// A matcher that matches all chars, except the substring specified in
22+
/// `filter` (if it is set).
23+
///
24+
/// # Examples
25+
/// - *
26+
/// - /old/*
27+
/// - /scripts/*.js
28+
/// - /:slug
29+
/// - /blog/:id
30+
/// - /blog/:id.html
31+
SingleCapture {
32+
filter: Option<String>,
33+
allow_empty: bool,
34+
},
35+
/// A regexp matcher. This is a bail-out matcher for arbitrary complexity
36+
/// matchers.
37+
///
38+
/// # Examples
39+
/// - /foo/:id?
40+
RegExp { regexp: Result<R, Error> },
41+
}
42+
43+
impl<R: RegExp> Matcher<R> {
44+
pub(crate) fn literal(literal: String) -> Self {
45+
Matcher {
46+
prefix: "".to_string(),
47+
suffix: "".to_string(),
48+
inner: InnerMatcher::Literal { literal },
49+
}
50+
}
51+
52+
pub fn matches<'a>(&self, mut input: &'a str) -> Option<Vec<&'a str>> {
53+
let prefix_len = self.prefix.len();
54+
let suffix_len = self.suffix.len();
55+
let input_len = input.len();
56+
if prefix_len + suffix_len > 0 {
57+
// The input must be at least as long as the prefix and suffix combined,
58+
// because these must both be present, and not overlap.
59+
if input_len < prefix_len + suffix_len {
60+
return None;
61+
}
62+
if !input.starts_with(&self.prefix) {
63+
return None;
64+
}
65+
if !input.ends_with(&self.suffix) {
66+
return None;
67+
}
68+
input = &input[prefix_len..input_len - suffix_len];
69+
}
70+
71+
match &self.inner {
72+
InnerMatcher::Literal { literal } => (input == literal).then(Vec::new),
73+
InnerMatcher::SingleCapture {
74+
filter,
75+
allow_empty,
76+
} => {
77+
if input.is_empty() && !allow_empty {
78+
return None;
79+
}
80+
if let Some(filter) = filter {
81+
if input.contains(filter) {
82+
return None;
83+
}
84+
}
85+
Some(vec![input])
86+
}
87+
InnerMatcher::RegExp { regexp, .. } => {
88+
regexp.as_ref().unwrap().matches(input)
89+
}
90+
}
91+
}
92+
}

src/parser.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ pub enum RegexSyntax {
2222
}
2323

2424
// Ref: https://wicg.github.io/urlpattern/#options-header
25-
#[derive(Debug)]
25+
#[derive(Debug, Clone)]
2626
pub struct Options {
27-
delimiter_code_point: String, // TODO: It must contain one ASCII code point or the empty string. maybe Option<char>?
27+
pub delimiter_code_point: String, // TODO: It must contain one ASCII code point or the empty string. maybe Option<char>?
2828
pub prefix_code_point: String, // TODO: It must contain one ASCII code point or the empty string. maybe Option<char>?
2929
regex_syntax: RegexSyntax,
3030
}

0 commit comments

Comments
 (0)