Skip to content

Commit

Permalink
(fix) unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
alexey-pelykh committed Jun 22, 2024
1 parent 20289a2 commit e803c54
Show file tree
Hide file tree
Showing 8 changed files with 308 additions and 111 deletions.
21 changes: 13 additions & 8 deletions jna/src/main/java/org/pcre4j/jna/Pcre2.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

import java.lang.reflect.Method;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Map;

/**
Expand Down Expand Up @@ -102,9 +103,11 @@ public long compile(String pattern, int options, int[] errorcode, long[] errorof
IntByReference errorCodeRef = new IntByReference();
LongByReference errorOffsetRef = new LongByReference();

final var pszPattern = pattern.getBytes(StandardCharsets.UTF_8);

Pointer code = library.pcre2_compile(
pattern,
pattern.length(),
pszPattern,
pszPattern.length,
options,
errorCodeRef,
errorOffsetRef,
Expand Down Expand Up @@ -193,10 +196,12 @@ public void matchContextFree(long mcontext) {

@Override
public int match(long code, String subject, int startoffset, int options, long matchData, long mcontext) {
final var pszSubject = subject.getBytes(StandardCharsets.UTF_8);

return library.pcre2_match(
new Pointer(code),
subject,
subject.length(),
pszSubject,
pszSubject.length,
startoffset,
options,
new Pointer(matchData),
Expand Down Expand Up @@ -229,8 +234,8 @@ private interface Library extends com.sun.jna.Library {
void pcre2_compile_context_free(Pointer ccontext);

Pointer pcre2_compile(
String pattern,
long patternLength,
byte[] pattern,
long length,
int options,
IntByReference errorcode,
LongByReference erroroffset,
Expand All @@ -257,9 +262,9 @@ Pointer pcre2_compile(

int pcre2_match(
Pointer code,
String subject,
byte[] subject,
long length,
long startOffset,
long startoffset,
int options,
Pointer matchData,
Pointer mcontext
Expand Down
2 changes: 1 addition & 1 deletion lib/src/main/java/org/pcre4j/Pcre2CompileError.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public Pcre2CompileError(String pattern, long offset, String message) {
* @param cause the cause of the error
*/
public Pcre2CompileError(String pattern, long offset, String message, Throwable cause) {
super("Error in pattern at %d (%s): %s".formatted(offset, getPatternRegion(pattern, offset), message), cause);
super("Error in pattern at %d \"%s\": %s".formatted(offset, getPatternRegion(pattern, offset), message), cause);
this.pattern = pattern;
this.offset = offset;
this.message = message;
Expand Down
29 changes: 6 additions & 23 deletions lib/src/main/java/org/pcre4j/Pcre2MatchData.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,34 +109,17 @@ public int ovectorCount() {
}

/**
* Get the output vector offset pairs
* Get the output vector composed of offset pairs; each offset pair represents the start and end of the match. The
* value of the offset is the index of the byte where the character starts, not the character index.
*
* @return the output vector offset pairs
* @return the output vector
*/
public OffsetPair[] ovector() {
final var count = ovectorCount();
final var offsets = new long[count * 2];
api.getOvector(handle, offsets);

final var ovector = new OffsetPair[count];
for (int pairIndex = 0; pairIndex < count; pairIndex++) {
ovector[pairIndex] = new OffsetPair(
(int) offsets[pairIndex * 2],
(int) offsets[pairIndex * 2 + 1]
);
}
public long[] ovector() {
final var ovector = new long[api.getOvectorCount(handle) * 2];
api.getOvector(handle, ovector);
return ovector;
}

/**
* The output vector offset pair
*
* @param start the start offset in the subject string
* @param end the end offset in the subject string
*/
public record OffsetPair(int start, int end) {
}

private record Clean(IPcre2 api, long matchData) implements Runnable {
@Override
public void run() {
Expand Down
128 changes: 116 additions & 12 deletions lib/src/main/java/org/pcre4j/Pcre4jUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,21 +85,42 @@ public static String[] getGroupNames(Pcre2Code code) {
* {@code null}
*/
public static String[] getMatchGroups(Pcre2Code code, String subject, Pcre2MatchData matchData) {
    // Guard first: the output vector can only be read from an existing match data block
    if (matchData != null) {
        final var offsets = matchData.ovector();
        return getMatchGroups(code, subject, offsets);
    }

    throw new IllegalArgumentException("matchData must not be null");
}

/**
* Get the match groups
*
* @param code the compiled pattern the match was performed with
* @param subject the subject string the match was performed against
* @param ovector an array of offset pairs corresponding to the match results
* @return an array of strings where the index is the group number and the value is the matched group or
* {@code null}
*/
public static String[] getMatchGroups(Pcre2Code code, String subject, long[] ovector) {
if (code == null) {
throw new IllegalArgumentException("code must not be null");
}
if (subject == null) {
throw new IllegalArgumentException("subject must not be null");
}
if (matchData == null) {
throw new IllegalArgumentException("matchData must not be null");
if (ovector == null) {
throw new IllegalArgumentException("ovector must not be null");
}

final var ovector = matchData.ovector();
final var matchGroups = new String[ovector.length];
for (var matchIndex = 0; matchIndex < ovector.length; matchIndex++) {
final var match = ovector[matchIndex];
matchGroups[matchIndex] = subject.substring(match.start(), match.end());
final var stringIndices = convertOvectorToStringIndices(subject, ovector);

final var matchGroupsCount = ovector.length / 2;
final var matchGroups = new String[matchGroupsCount];
for (var matchIndex = 0; matchIndex < matchGroupsCount; matchIndex++) {
matchGroups[matchIndex] = subject.substring(
stringIndices[matchIndex * 2],
stringIndices[matchIndex * 2 + 1]
);
}
return matchGroups;
}
Expand All @@ -113,26 +134,109 @@ public static String[] getMatchGroups(Pcre2Code code, String subject, Pcre2Match
* @return a map of group names to the matched group or {@code null}
*/
public static Map<String, String> getNamedMatchGroups(Pcre2Code code, String subject, Pcre2MatchData matchData) {
    // Guard first: the output vector can only be read from an existing match data block
    if (matchData != null) {
        final var offsets = matchData.ovector();
        return getNamedMatchGroups(code, subject, offsets);
    }

    throw new IllegalArgumentException("matchData must not be null");
}

/**
* Get the match named groups
*
* @param code the compiled pattern the match was performed with
* @param subject the subject string the match was performed against
* @param ovector an array of offset pairs corresponding to the match results
* @return a map of group names to the matched group or {@code null}
*/
public static Map<String, String> getNamedMatchGroups(Pcre2Code code, String subject, long[] ovector) {
if (code == null) {
throw new IllegalArgumentException("code must not be null");
}
if (subject == null) {
throw new IllegalArgumentException("subject must not be null");
}
if (matchData == null) {
throw new IllegalArgumentException("matchData must not be null");
if (ovector == null) {
throw new IllegalArgumentException("ovector must not be null");
}

final var stringIndices = convertOvectorToStringIndices(subject, ovector);

final var groupNames = getGroupNames(code);
final var ovector = matchData.ovector();
final var matchGroups = new HashMap<String, String>();
for (var matchIndex = 1; matchIndex < ovector.length; matchIndex++) {
final var match = ovector[matchIndex];
final var groupName = groupNames[matchIndex - 1];
if (groupName != null) {
matchGroups.put(groupName, subject.substring(match.start(), match.end()));
matchGroups.put(groupName, subject.substring(
stringIndices[matchIndex * 2],
stringIndices[matchIndex * 2 + 1]
));
}
}
return matchGroups;
}

/**
 * Convert byte-based ovector offset pairs to string index pairs.
 * <p>
 * The subject is encoded as UTF-8 and the conversion is delegated to
 * {@link #convertOvectorToStringIndices(byte[], long[])}.
 *
 * @param subject the string to which the ovector values correspond
 * @param ovector the byte-based ovector offset pairs
 * @return the string index pairs, in the same order as the ovector values
 * @throws IllegalArgumentException if {@code subject} is {@code null}
 */
public static int[] convertOvectorToStringIndices(String subject, long[] ovector) {
    if (subject != null) {
        final byte[] utf8Subject = subject.getBytes(StandardCharsets.UTF_8);
        return convertOvectorToStringIndices(utf8Subject, ovector);
    }

    throw new IllegalArgumentException("subject must not be null");
}

/**
 * Convert the byte-based ovector offset pairs to string index pairs.
 * <p>
 * The returned values are UTF-16 code unit indices, i.e. they can be passed directly to
 * {@link String#substring(int, int)}: a 4-byte UTF-8 sequence (a supplementary character) advances the string
 * index by two, every other character advances it by one.
 * <p>
 * Offsets are not required to lie within the overall match (the first pair): groups captured inside lookarounds
 * may start before or end after it. An offset of {@code -1} (how an unset group is expected to be reported, see
 * PCRE2_UNSET) is passed through as {@code -1}.
 *
 * @param subject the UTF-8 bytes of the string to which the ovector values correspond
 * @param ovector the byte-based ovector offset pairs
 * @return the string index pairs, in the same order as the ovector values
 * @throws IllegalArgumentException if {@code subject} or {@code ovector} is {@code null}, if {@code ovector} is
 *                                  empty or has an odd number of elements, if the first pair is reversed, or if
 *                                  a set offset lies outside of the subject
 */
public static int[] convertOvectorToStringIndices(byte[] subject, long[] ovector) {
    if (subject == null) {
        throw new IllegalArgumentException("subject must not be null");
    }
    if (ovector == null) {
        throw new IllegalArgumentException("ovector must not be null");
    }
    if (ovector.length < 2) {
        throw new IllegalArgumentException("ovector must have at least 2 elements");
    }
    if (ovector.length % 2 != 0) {
        throw new IllegalArgumentException("ovector must have an even number of elements");
    }
    if (ovector[0] > ovector[1]) {
        throw new IllegalArgumentException("ovector start must be less than or equal to ovector end");
    }

    // NOTE(review): PCRE2_UNSET is ~(PCRE2_SIZE)0, assumed to surface as -1 through a signed 64-bit long
    final long unset = -1L;

    // Find the smallest byte window covering every set offset; usually this is just the overall match, but
    // lookaround captures may widen it
    long lowest = Long.MAX_VALUE;
    long highest = Long.MIN_VALUE;
    for (final long offset : ovector) {
        if (offset == unset) {
            continue;
        }
        if (offset < 0 || offset > subject.length) {
            throw new IllegalArgumentException(
                    "ovector offset " + offset + " is outside of the subject of " + subject.length + " bytes"
            );
        }
        lowest = Math.min(lowest, offset);
        highest = Math.max(highest, offset);
    }

    final var stringIndices = new int[ovector.length];
    if (lowest > highest) {
        // Every offset is unset, there is nothing to map
        for (var valueIndex = 0; valueIndex < stringIndices.length; valueIndex++) {
            stringIndices[valueIndex] = -1;
        }
        return stringIndices;
    }

    // Both bounds were validated against subject.length, so they fit into an int
    final var windowStart = (int) lowest;
    final var windowEnd = (int) highest;

    // Calculate the mapping of byte offsets to string indices for the relevant region of the subject
    var stringIndex = 0;
    final var byteOffsetToStringIndex = new int[windowEnd - windowStart + 1];
    for (var byteIndex = 0; byteIndex < windowEnd; byteIndex++) {
        if (byteIndex >= windowStart) {
            byteOffsetToStringIndex[byteIndex - windowStart] = stringIndex;
        }
        stringIndex += utf16UnitsStartedBy(subject[byteIndex]);
    }
    byteOffsetToStringIndex[windowEnd - windowStart] = stringIndex;

    // Convert byte offsets to string indices
    for (var valueIndex = 0; valueIndex < ovector.length; valueIndex++) {
        final var offset = ovector[valueIndex];
        stringIndices[valueIndex] = offset == unset
                ? -1
                : byteOffsetToStringIndex[(int) offset - windowStart];
    }

    return stringIndices;
}

/**
 * Get the number of UTF-16 code units contributed by the character that the given UTF-8 byte starts.
 *
 * @param utf8Byte a single byte of a UTF-8 encoded string
 * @return 0 for a continuation byte, 2 for the lead byte of a 4-byte sequence, 1 otherwise
 */
private static int utf16UnitsStartedBy(byte utf8Byte) {
    if ((utf8Byte & 0xC0) == 0x80) {
        // Continuation byte (10xxxxxx): belongs to a character that was already counted
        return 0;
    }
    if ((utf8Byte & 0xF8) == 0xF0) {
        // Lead byte of a 4-byte sequence (11110xxx): a supplementary character, i.e. a surrogate pair in a String
        return 2;
    }
    return 1;
}
}
Loading

0 comments on commit e803c54

Please sign in to comment.