Skip to content

Commit 8d1535c

Browse files
committed
Add control token for control character escapes
1 parent 666c029 commit 8d1535c

File tree

3 files changed

+105
-2
lines changed

3 files changed

+105
-2
lines changed

src/expression.ts

+38
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ import {
1717
bindAsIncomplete,
1818
captureName,
1919
compareCodePoint,
20+
controlChar,
2021
escapeForCharClass,
2122
flagString,
2223
getLiteralString,
@@ -273,6 +274,15 @@ class RegExpBuilder implements RegExpToken {
273274
return bindAsIncomplete(func, this, 'unicode');
274275
}
275276

277+
public get control(): RegExpToken['control'] {
278+
function func(this: RegExpBuilder, ...args: RegExpLiteral): RegExpToken {
279+
const literal = getLiteralString(args, false);
280+
if (!controlChar.test(literal)) throw new Error('Invalid control character');
281+
return this.addNode(`\\c${literal}`);
282+
}
283+
return bindAsIncomplete(func, this, 'control');
284+
}
285+
276286
public get charIn(): RegExpToken['charIn'] {
277287
function func(
278288
this: RegExpBuilder,
@@ -1244,6 +1254,34 @@ export const hex = r.hex;
12441254
*/
12451255
export const unicode = r.unicode;
12461256

1257+
/**
1258+
* Match a control character with value equal to the given letter's character value modulo 32.
1259+
* Only a letter from `a` to `z` or `A` to `Z` is allowed.
1260+
*
1261+
* For example, `\cJ` represents line break (`\n`), because the code point of `J` is 74, and 74 modulo 32 is 10,
1262+
* which is the code point of line break. Because an uppercase letter and its lowercase form differ by 32,
1263+
* `\cJ` and `\cj` are equivalent. You can represent control characters from 1 to 26 in this form.
1264+
*
1265+
* ( https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Character_escape#description )
1266+
*
1267+
* @example
1268+
*
1269+
* ```js
1270+
* control`j`
1271+
* control('j')
1272+
* control`J`
1273+
* control('J')
1274+
* ```
1275+
*
1276+
* RegExp equivalent:
1277+
*
1278+
* ```js
1279+
* /\cj/
1280+
* /\cJ/
1281+
* ```
1282+
*/
1283+
export const control = r.control;
1284+
12471285
/**
12481286
* Match a character listed in the group. A hyphen denotes a range of characters, such as `a-z`.
12491287
*

src/helper.ts

+4 -2
Original file line number | Diff line number | Diff line change
@@ -8,11 +8,13 @@ export const hexNumber = /^[0-9a-fA-F]+$/;
88

99
export const octalNumber = /^[0-7]+$/;
1010

11+
export const controlChar = /^[a-zA-Z]$/;
12+
1113
// octal escape sequences are not matched here because they should be wrapped in a character class
12-
export const negatableCharLiteral = /^(?:\\u[0-9a-fA-F]{4}|\\x[0-9a-fA-F]{2})$/;
14+
export const negatableCharLiteral = /^(?:\\u[0-9a-fA-F]{4}|\\x[0-9a-fA-F]{2}|\\c[a-zA-Z])$/;
1315

1416
// last option refers to octal character or capture group backreference
15-
export const charLiteral = /^(?:\\u[0-9a-fA-F]{4}|\\x[0-9a-fA-F]{2}|\\\d{1,3})$/;
17+
export const charLiteral = /^(?:\\u[0-9a-fA-F]{4}|\\x[0-9a-fA-F]{2}|\\c[a-zA-Z]|\\\d{1,3})$/;
1618

1719
export const captureName = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
1820

src/types.ts

+63
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,41 @@ export interface LiteralFunction {
99
(template: TemplateStringsArray, ...args: unknown[]): RegExpToken;
1010
}
1111

12+
type Lowercase =
13+
| 'a'
14+
| 'b'
15+
| 'c'
16+
| 'd'
17+
| 'e'
18+
| 'f'
19+
| 'g'
20+
| 'h'
21+
| 'i'
22+
| 'j'
23+
| 'k'
24+
| 'l'
25+
| 'm'
26+
| 'n'
27+
| 'o'
28+
| 'p'
29+
| 'q'
30+
| 'r'
31+
| 's'
32+
| 't'
33+
| 'u'
34+
| 'v'
35+
| 'w'
36+
| 'x'
37+
| 'y'
38+
| 'z';
39+
40+
type Alphabet = Lowercase | Uppercase<Lowercase>;
41+
42+
export interface ControlFunction {
43+
(literal: Alphabet): RegExpToken;
44+
(template: TemplateStringsArray, ...args: unknown[]): RegExpToken;
45+
}
46+
1247
/**
1348
* A function with flexible parameters and return type.
1449
* This is used by custom tokens.
@@ -777,6 +812,34 @@ export interface RegExpToken {
777812
*/
778813
get unicode(): LiteralFunction & IncompleteToken;
779814

815+
/**
816+
* Match a control character with value equal to the given letter's character value modulo 32.
817+
* Only a letter from `a` to `z` or `A` to `Z` is allowed.
818+
*
819+
* For example, `\cJ` represents line break (`\n`), because the code point of `J` is 74, and 74 modulo 32 is 10,
820+
* which is the code point of line break. Because an uppercase letter and its lowercase form differ by 32,
821+
* `\cJ` and `\cj` are equivalent. You can represent control characters from 1 to 26 in this form.
822+
*
823+
* ( https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Character_escape#description )
824+
*
825+
* @example
826+
*
827+
* ```js
828+
* control`j`
829+
* control('j')
830+
* control`J`
831+
* control('J')
832+
* ```
833+
*
834+
* RegExp equivalent:
835+
*
836+
* ```js
837+
* /\cj/
838+
* /\cJ/
839+
* ```
840+
*/
841+
get control(): ControlFunction & IncompleteToken;
842+
780843
/**
781844
* Match a character listed in the group. A hyphen denotes a range of characters, such as `a-z`.
782845
*

0 commit comments

Comments
 (0)