Skip to content

Commit

Permalink
TIKA-4389 cleanups for TIKA-4381 (#2144)
Browse files Browse the repository at this point in the history
(cherry picked from commit 5737f09)
  • Loading branch information
tballison committed Feb 26, 2025
1 parent 0b58752 commit c63985a
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 76 deletions.
2 changes: 1 addition & 1 deletion tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public interface MAPI {

String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
String PREFIX_MAPI_ATTACH_META = "mapi:attach" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
String PREFIX_MAPI_RAW_META = PREFIX_MAPI_META + "raw" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
String PREFIX_MAPI_PROPERTY = PREFIX_MAPI_META + "property" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

/**
* MAPI message class. What type of .msg/MAPI file is it?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@

import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.microsoft.OutlookExtractor;
import org.apache.tika.utils.StringUtils;

/**
Expand All @@ -59,60 +58,23 @@ public class ExtendedMetadataExtractor {
loadProperties();
}


/**
 * Parses every entry of {@code arr} with {@link #parseDataType(String)} and collects
 * the results in input order. Entries for which {@code parseDataType} returns
 * {@code null} are skipped.
 *
 * @param arr the raw data type strings to parse; must not be {@code null}
 * @return the parsed types, never {@code null}. For a single-element input the result
 *         is an immutable list (empty or with one element); otherwise it is a new
 *         {@link ArrayList}.
 * @throws IllegalArgumentException if {@code parseDataType} rejects an entry
 */
private static List<Types.MAPIType> parseDataTypes(String[] arr) {
    if (arr.length == 1) {
        Types.MAPIType type = parseDataType(arr[0]);
        if (type != null) {
            return List.of(type);
        }
        //use the typed factory rather than the raw Collections.EMPTY_LIST constant,
        //which triggers an unchecked conversion to List<Types.MAPIType>
        return Collections.emptyList();
    }
    List<Types.MAPIType> types = new ArrayList<>();
    for (String s : arr) {
        Types.MAPIType type = parseDataType(s);
        if (type != null) {
            types.add(type);
        }
    }
    return types;
}

private static Types.MAPIType parseDataType(String s) {
if (StringUtils.isBlank(s)) {
return null;
}
String[] parts = s.split(", ");
if (parts.length != 2) {
throw new IllegalArgumentException("expected two parts: " + s);
}
String num = parts[1];
if (num.startsWith("0x")) {
num = num.substring(2);
public static void extract(MAPIMessage msg, Metadata metadata) {
if (msg.getNameIdChunks() == null) {
return;
}
int id = Integer.parseInt(num, 16);
Types.MAPIType type = Types.getById(id);
if (type == null) {
//TODO:
/*
PtypRestriction, 0x00FD
PtypRuleAction, 0x00FE
PtypServerId, 0x00FB
*/
return Types.createCustom(id);
if (msg.getMainChunks() == null || msg.getMainChunks().getRawProperties() == null) {
return;
}
return type;
}


public static void extract(MAPIMessage msg, Metadata metadata) {
//prep our custom nameIdchunk handler
//prep our custom nameIdChunk handler
TikaNameIdChunks tikaNameIdChunks = new TikaNameIdChunks();
//short-circuit for files that have an empty nameIdChunk
long len = 0;
for (Chunk chunk : msg
.getNameIdChunks()
.getAll()) {
if (chunk == null) {
continue;
}
tikaNameIdChunks.record(chunk);
if (chunk instanceof ByteChunk) {
byte[] value = ((ByteChunk)chunk).getValue();
Expand All @@ -124,14 +86,21 @@ public static void extract(MAPIMessage msg, Metadata metadata) {
if (len == 0) {
return;
}
tikaNameIdChunks.chunksComplete();
try {
tikaNameIdChunks.chunksComplete();
} catch (IllegalStateException e) {
LOGGER.warn("bad namechunks stream", e);
}
for (Map.Entry<MAPIProperty, PropertyValue> e : msg
.getMainChunks()
.getRawProperties()
.entrySet()) {
//the mapiproperties from POI are the literal storage id for that particular file.
//Those storage ids must be mapped via the name chunk ids into a known id
PropertyValue v = e.getValue();
if (v == null) {
continue;
}
List<MAPITag> mapiTags = tikaNameIdChunks.getTags(e.getKey().id);
MAPITagPair pair = null;
for (MAPITag mapiTag : mapiTags) {
Expand All @@ -146,7 +115,6 @@ public static void extract(MAPIMessage msg, Metadata metadata) {
}
updateMetadata(pair, v, metadata);
}

}


Expand Down Expand Up @@ -180,7 +148,7 @@ private static void updateMetadata(MAPITagPair pair, PropertyValue propertyValue
if (!includeType(propertyValue)) {
return;
}
String key = MAPI.PREFIX_MAPI_RAW_META + pair.tikaMapiProperty.name;
String key = MAPI.PREFIX_MAPI_PROPERTY + pair.tikaMapiProperty.name;
Types.MAPIType type = propertyValue.getActualType();
if (type == Types.TIME || type == Types.MV_TIME || type == Types.APP_TIME || type == Types.MV_APP_TIME) {
Calendar calendar = (Calendar) propertyValue.getValue();
Expand All @@ -190,8 +158,12 @@ private static void updateMetadata(MAPITagPair pair, PropertyValue propertyValue
.toString();
metadata.add(key, calendarString);
} else if (type == Types.BOOLEAN) {
metadata.add(key, Boolean.toString((boolean) propertyValue.getValue()));
} else {
Boolean val = (Boolean)propertyValue.getValue();
if (val == null) {
return;
}
metadata.add(key, Boolean.toString(val));
} else if (! StringUtils.isBlank(propertyValue.toString())) {
metadata.add(key, propertyValue.toString());
}

Expand All @@ -205,11 +177,6 @@ private static boolean includeType(PropertyValue propertyValue) {
return true;
}

/**
 * Reports whether the actual type of {@code propertyValue} is one of the four string
 * types: ASCII or Unicode, single-valued or multi-valued.
 *
 * @param propertyValue the property value whose actual type is inspected
 * @return {@code true} if the actual type is a string type, {@code false} otherwise
 */
private static boolean isString(PropertyValue propertyValue) {
    Types.MAPIType actualType = propertyValue.getActualType();
    //single-valued string types
    if (actualType == Types.ASCII_STRING || actualType == Types.UNICODE_STRING) {
        return true;
    }
    //multi-valued string types
    return actualType == Types.MV_ASCII_STRING || actualType == Types.MV_UNICODE_STRING;
}

private static class TikaMapiProperty {
String name;
ClassID classID; // can be null
Expand Down Expand Up @@ -237,7 +204,7 @@ private static void loadProperties() {
.toUUIDString(), setType.getClassID());
}
try (BufferedReader r = new BufferedReader(
new InputStreamReader(OutlookExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"), UTF_8))) {
new InputStreamReader(ExtendedMetadataExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"), UTF_8))) {
String line = r.readLine();
while (line != null) {
if (line.isBlank() || line.startsWith("#")) {
Expand Down Expand Up @@ -309,4 +276,50 @@ public MAPITagPair(MAPITag mapiTag, TikaMapiProperty tikaMapiProperty) {
this.tikaMapiProperty = tikaMapiProperty;
}
}


/**
 * Parses every entry of {@code arr} with {@link #parseDataType(String)} and collects
 * the results in input order. Entries for which {@code parseDataType} returns
 * {@code null} are skipped.
 *
 * @param arr the raw data type strings to parse; must not be {@code null}
 * @return the parsed types, never {@code null}. For a single-element input the result
 *         is an immutable list (empty or with one element); otherwise it is a new
 *         {@link ArrayList}.
 * @throws IllegalArgumentException if {@code parseDataType} rejects an entry
 */
private static List<Types.MAPIType> parseDataTypes(String[] arr) {
    if (arr.length == 1) {
        Types.MAPIType type = parseDataType(arr[0]);
        if (type != null) {
            return List.of(type);
        }
        //use the typed factory rather than the raw Collections.EMPTY_LIST constant,
        //which triggers an unchecked conversion to List<Types.MAPIType>
        return Collections.emptyList();
    }
    List<Types.MAPIType> types = new ArrayList<>();
    for (String s : arr) {
        Types.MAPIType type = parseDataType(s);
        if (type != null) {
            types.add(type);
        }
    }
    return types;
}

/**
 * Parses a single data type entry of the form {@code "<name>, <hex id>"}, e.g.
 * {@code "PtypRestriction, 0x00FD"}. Only the hex id after the {@code ", "} separator
 * is used; a leading {@code 0x} is optional.
 *
 * @param s the entry to parse
 * @return {@code null} if {@code s} is blank; otherwise the type that
 *         {@code Types.getById} returns for the id, or a custom type created for that
 *         id when the lookup yields {@code null}
 * @throws IllegalArgumentException if {@code s} does not split into exactly two parts,
 *         or if the id is not valid hexadecimal ({@link NumberFormatException})
 */
private static Types.MAPIType parseDataType(String s) {
    if (StringUtils.isBlank(s)) {
        return null;
    }
    final String[] nameAndId = s.split(", ");
    if (nameAndId.length != 2) {
        throw new IllegalArgumentException("expected two parts: " + s);
    }
    final String rawId = nameAndId[1];
    //strip the optional hex prefix before parsing
    final String hexDigits = rawId.startsWith("0x") ? rawId.substring(2) : rawId;
    final int typeId = Integer.parseInt(hexDigits, 16);
    final Types.MAPIType knownType = Types.getById(typeId);
    //TODO: ids with no known type currently fall back to a custom type, e.g.
    /*
        PtypRestriction, 0x00FD
        PtypRuleAction, 0x00FE
        PtypServerId, 0x00FB
     */
    return knownType != null ? knownType : Types.createCustom(typeId);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Licensed to the Apache Software Foundation (ASF) under one or more
package org.apache.tika.parser.microsoft.msg;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
Expand Down Expand Up @@ -132,10 +133,11 @@ public void chunksComplete() {
loadTags();
}

//does not return null
public List<MAPITag> getTags(int storageId) {
List<MAPITag> tags = mapiTagMap.get(storageId);
if (tags == null) {
return new ArrayList<>();
return Collections.emptyList();
}
return tags;
}
Expand Down Expand Up @@ -235,7 +237,7 @@ private long getPropertyTag(long streamID, long nameOffset, long propertyNameCRC
return 0;
}
for (Chunk chunk : chunks) {
if (chunk.getType() != Types.BINARY || chunk.getChunkId() != streamID) {
if (chunk == null || chunk.getType() != Types.BINARY || chunk.getChunkId() != streamID) {
continue;
}
byte[] matchChunkBytes = ((ByteChunk) chunk).getValue();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -317,17 +317,17 @@ public void testAppointmentExtendedMetadata() throws Exception {

List<Metadata> metadataList = getRecursiveMetadata("testMSG_Appointment.msg", parseContext);
Metadata m = metadataList.get(0);
assertTrue(m.get("mapi:raw:PidLidAppointmentEndWhole").contains("2017-02-28T19"));
assertTrue(m.get("mapi:raw:PidLidAppointmentStartWhole").contains("2017-02-28T18"));
assertTrue(m.get("mapi:raw:PidLidClipStart").contains("2017-02-28T18"));
assertTrue(m.get("mapi:raw:PidLidClipEnd").contains("2017-02-28T19"));
assertTrue(m.get("mapi:raw:PidLidCommonStart").contains("2017-02-28T18"));
assertTrue(m.get("mapi:raw:PidLidCommonEnd").contains("2017-02-28T19"));
assertTrue(m.get("mapi:raw:PidLidReminderSignalTime").contains("4501-01-01T00"));
assertTrue(m.get("mapi:raw:PidLidReminderTime").contains("2017-02-28T18"));
assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
assertEquals("0", m.get("mapi:raw:PidLidAppointmentSequence"));
assertEquals("false", m.get("mapi:raw:PidLidRecurring"));
assertTrue(m.get("mapi:property:PidLidAppointmentEndWhole").contains("2017-02-28T19"));
assertTrue(m.get("mapi:property:PidLidAppointmentStartWhole").contains("2017-02-28T18"));
assertTrue(m.get("mapi:property:PidLidClipStart").contains("2017-02-28T18"));
assertTrue(m.get("mapi:property:PidLidClipEnd").contains("2017-02-28T19"));
assertTrue(m.get("mapi:property:PidLidCommonStart").contains("2017-02-28T18"));
assertTrue(m.get("mapi:property:PidLidCommonEnd").contains("2017-02-28T19"));
assertTrue(m.get("mapi:property:PidLidReminderSignalTime").contains("4501-01-01T00"));
assertTrue(m.get("mapi:property:PidLidReminderTime").contains("2017-02-28T18"));
assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
assertEquals("0", m.get("mapi:property:PidLidAppointmentSequence"));
assertEquals("false", m.get("mapi:property:PidLidRecurring"));
}

@Test
Expand All @@ -338,12 +338,28 @@ public void testTaskExtendedMetadata() throws Exception {
parseContext.set(OfficeParserConfig.class, officeParserConfig);
List<Metadata> metadataList = getRecursiveMetadata("testMSG_Task.msg", parseContext);
Metadata m = metadataList.get(0);
assertTrue(m.get("mapi:raw:PidLidToDoOrdinalDate").contains("2017-02-28T18:44"));
assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:44"));
assertEquals("0", m.get("mapi:raw:PidLidTaskActualEffort"));
assertEquals("false", m.get("mapi:raw:PidLidTeamTask"));
assertTrue(m.get("mapi:property:PidLidToDoOrdinalDate").contains("2017-02-28T18:44"));
assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:44"));
assertEquals("0", m.get("mapi:property:PidLidTaskActualEffort"));
assertEquals("false", m.get("mapi:property:PidLidTeamTask"));
}

/**
 * Checks that parsing testMSG_Contact.msg puts the expected
 * mapi:property:PidLidValidFlagStringProof timestamp on the first metadata object.
 */
@Test
public void testContactExtendedMetadata() throws Exception {
    //only the first metadata object in the recursive list is inspected
    Metadata first = getRecursiveMetadata("testMSG_Contact.msg").get(0);
    String validFlagStringProof = first.get("mapi:property:PidLidValidFlagStringProof");
    assertEquals("2017-02-28T18:41:37Z", validFlagStringProof);
}


/**
 * Checks that parsing testMSG_Post.msg puts the expected
 * mapi:property:PidLidValidFlagStringProof timestamp on the first metadata object.
 */
@Test
public void testPostExtendedMetadata() throws Exception {
    //only the first metadata object in the recursive list is inspected
    Metadata first = getRecursiveMetadata("testMSG_Post.msg").get(0);
    String validFlagStringProof = first.get("mapi:property:PidLidValidFlagStringProof");
    assertEquals("2017-02-28T18:47:11Z", validFlagStringProof);
}


@Test
public void testHandlingAllAlternativesBodies() throws Exception {
//test that default only has one body
Expand Down

0 comments on commit c63985a

Please sign in to comment.