diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java index 45fbad1bc3..5f4ef12ae5 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java @@ -26,7 +26,7 @@ public interface MAPI { String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; String PREFIX_MAPI_ATTACH_META = "mapi:attach" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; - String PREFIX_MAPI_RAW_META = PREFIX_MAPI_META + "raw" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + String PREFIX_MAPI_PROPERTY = PREFIX_MAPI_META + "property" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; /** * MAPI message class. What type of .msg/MAPI file is it? diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java index 6d4880c517..877bd796e5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java @@ -42,7 +42,6 @@ import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.microsoft.OutlookExtractor; import org.apache.tika.utils.StringUtils; /** @@ -59,60 +58,23 @@ public class ExtendedMetadataExtractor { loadProperties(); } - - private static List parseDataTypes(String[] arr) { - if (arr.length == 1) { - Types.MAPIType type = parseDataType(arr[0]); - if (type != null) { - return List.of(type); - } - return Collections.EMPTY_LIST; - } - List types = new ArrayList<>(); - for (String s : arr) { - Types.MAPIType type = parseDataType(s); - if (type != null) { - types.add(type); - } - } - return types; - } - - private static Types.MAPIType parseDataType(String s) { - if (StringUtils.isBlank(s)) { - return null; - } - String[] parts = s.split(", "); - if (parts.length != 2) { - throw new IllegalArgumentException("expected two parts: " + s); - } - String num = parts[1]; - if (num.startsWith("0x")) { - num = num.substring(2); + public static void extract(MAPIMessage msg, Metadata metadata) { + if (msg.getNameIdChunks() == null) { + return; } - int id = Integer.parseInt(num, 16); - Types.MAPIType type = Types.getById(id); - if (type == null) { - //TODO: - /* - PtypRestriction, 0x00FD - PtypRuleAction, 0x00FE - PtypServerId, 0x00FB - */ - return Types.createCustom(id); + if (msg.getMainChunks() == null || msg.getMainChunks().getRawProperties() == null) { + return; } - return type; - } - - - public static void extract(MAPIMessage msg, Metadata metadata) { - //prep our custom nameIdchunk handler + //prep our custom nameIdChunk handler TikaNameIdChunks tikaNameIdChunks = new TikaNameIdChunks(); //short-circuit for files that have an empty nameIdChunk long len = 0; for (Chunk chunk : msg .getNameIdChunks() .getAll()) { + if (chunk == null) { + continue; + } tikaNameIdChunks.record(chunk); if (chunk instanceof ByteChunk) { byte[] value = ((ByteChunk)chunk).getValue(); @@ -124,7 +86,11 @@ public static void extract(MAPIMessage msg, Metadata metadata) { if (len == 0) { return; } - tikaNameIdChunks.chunksComplete(); + try { + tikaNameIdChunks.chunksComplete(); + } catch (IllegalStateException e) { + LOGGER.warn("bad namechunks stream", e); + } for (Map.Entry e : msg .getMainChunks() .getRawProperties() @@ -132,6 +98,9 @@ public static void extract(MAPIMessage msg, Metadata metadata) { //the mapiproperties from POI are the literal storage id for that particular file. //Those storage ids must be mapped via the name chunk ids into a known id PropertyValue v = e.getValue(); + if (v == null) { + continue; + } List mapiTags = tikaNameIdChunks.getTags(e.getKey().id); MAPITagPair pair = null; for (MAPITag mapiTag : mapiTags) { @@ -146,7 +115,6 @@ public static void extract(MAPIMessage msg, Metadata metadata) { } updateMetadata(pair, v, metadata); } - } @@ -180,7 +148,7 @@ private static void updateMetadata(MAPITagPair pair, PropertyValue propertyValue if (!includeType(propertyValue)) { return; } - String key = MAPI.PREFIX_MAPI_RAW_META + pair.tikaMapiProperty.name; + String key = MAPI.PREFIX_MAPI_PROPERTY + pair.tikaMapiProperty.name; Types.MAPIType type = propertyValue.getActualType(); if (type == Types.TIME || type == Types.MV_TIME || type == Types.APP_TIME || type == Types.MV_APP_TIME) { Calendar calendar = (Calendar) propertyValue.getValue(); @@ -190,8 +158,12 @@ private static void updateMetadata(MAPITagPair pair, PropertyValue propertyValue .toString(); metadata.add(key, calendarString); } else if (type == Types.BOOLEAN) { - metadata.add(key, Boolean.toString((boolean) propertyValue.getValue())); - } else { + Boolean val = (Boolean)propertyValue.getValue(); + if (val == null) { + return; + } + metadata.add(key, Boolean.toString(val)); + } else if (! StringUtils.isBlank(propertyValue.toString())) { metadata.add(key, propertyValue.toString()); } @@ -205,11 +177,6 @@ private static boolean includeType(PropertyValue propertyValue) { return true; } - private static boolean isString(PropertyValue propertyValue) { - Types.MAPIType mapiType = propertyValue.getActualType(); - return mapiType == Types.ASCII_STRING || mapiType == Types.MV_ASCII_STRING || mapiType == Types.MV_UNICODE_STRING || mapiType == Types.UNICODE_STRING; - } - private static class TikaMapiProperty { String name; ClassID classID; // can be null @@ -237,7 +204,7 @@ private static void loadProperties() { .toUUIDString(), setType.getClassID()); } try (BufferedReader r = new BufferedReader( - new InputStreamReader(OutlookExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"), UTF_8))) { + new InputStreamReader(ExtendedMetadataExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"), UTF_8))) { String line = r.readLine(); while (line != null) { if (line.isBlank() || line.startsWith("#")) { @@ -309,4 +276,50 @@ public MAPITagPair(MAPITag mapiTag, TikaMapiProperty tikaMapiProperty) { this.tikaMapiProperty = tikaMapiProperty; } } + + + private static List parseDataTypes(String[] arr) { + if (arr.length == 1) { + Types.MAPIType type = parseDataType(arr[0]); + if (type != null) { + return List.of(type); + } + return Collections.EMPTY_LIST; + } + List types = new ArrayList<>(); + for (String s : arr) { + Types.MAPIType type = parseDataType(s); + if (type != null) { + types.add(type); + } + } + return types; + } + + private static Types.MAPIType parseDataType(String s) { + if (StringUtils.isBlank(s)) { + return null; + } + String[] parts = s.split(", "); + if (parts.length != 2) { + throw new IllegalArgumentException("expected two parts: " + s); + } + String num = parts[1]; + if (num.startsWith("0x")) { + num = num.substring(2); + } + int id = Integer.parseInt(num, 16); + Types.MAPIType type = Types.getById(id); + if (type == null) { + //TODO: + /* + PtypRestriction, 0x00FD + PtypRuleAction, 0x00FE + PtypServerId, 0x00FB + */ + return Types.createCustom(id); + } + return type; + } + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java index 54e963ee34..ba54f6e4ec 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java @@ -18,6 +18,7 @@ Licensed to the Apache Software Foundation (ASF) under one or more package org.apache.tika.parser.microsoft.msg; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Locale; @@ -132,10 +133,11 @@ public void chunksComplete() { loadTags(); } + //does not return null public List getTags(int storageId) { List tags = mapiTagMap.get(storageId); if (tags == null) { - return new ArrayList<>(); + return Collections.emptyList(); } return tags; } @@ -235,7 +237,7 @@ private long getPropertyTag(long streamID, long nameOffset, long propertyNameCRC return 0; } for (Chunk chunk : chunks) { - if (chunk.getType() != Types.BINARY || chunk.getChunkId() != streamID) { + if (chunk == null || chunk.getType() != Types.BINARY || chunk.getChunkId() != streamID) { continue; } byte[] matchChunkBytes = ((ByteChunk) chunk).getValue(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index 17f56885fb..84ec3ccc3e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -317,17 +317,17 @@ public void testAppointmentExtendedMetadata() throws Exception { List metadataList = getRecursiveMetadata("testMSG_Appointment.msg", parseContext); Metadata m = metadataList.get(0); - assertTrue(m.get("mapi:raw:PidLidAppointmentEndWhole").contains("2017-02-28T19")); - assertTrue(m.get("mapi:raw:PidLidAppointmentStartWhole").contains("2017-02-28T18")); - assertTrue(m.get("mapi:raw:PidLidClipStart").contains("2017-02-28T18")); - assertTrue(m.get("mapi:raw:PidLidClipEnd").contains("2017-02-28T19")); - assertTrue(m.get("mapi:raw:PidLidCommonStart").contains("2017-02-28T18")); - assertTrue(m.get("mapi:raw:PidLidCommonEnd").contains("2017-02-28T19")); - assertTrue(m.get("mapi:raw:PidLidReminderSignalTime").contains("4501-01-01T00")); - assertTrue(m.get("mapi:raw:PidLidReminderTime").contains("2017-02-28T18")); - assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:42")); - assertEquals("0", m.get("mapi:raw:PidLidAppointmentSequence")); - assertEquals("false", m.get("mapi:raw:PidLidRecurring")); + assertTrue(m.get("mapi:property:PidLidAppointmentEndWhole").contains("2017-02-28T19")); + assertTrue(m.get("mapi:property:PidLidAppointmentStartWhole").contains("2017-02-28T18")); + assertTrue(m.get("mapi:property:PidLidClipStart").contains("2017-02-28T18")); + assertTrue(m.get("mapi:property:PidLidClipEnd").contains("2017-02-28T19")); + assertTrue(m.get("mapi:property:PidLidCommonStart").contains("2017-02-28T18")); + assertTrue(m.get("mapi:property:PidLidCommonEnd").contains("2017-02-28T19")); + assertTrue(m.get("mapi:property:PidLidReminderSignalTime").contains("4501-01-01T00")); + assertTrue(m.get("mapi:property:PidLidReminderTime").contains("2017-02-28T18")); + assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:42")); + assertEquals("0", m.get("mapi:property:PidLidAppointmentSequence")); + assertEquals("false", m.get("mapi:property:PidLidRecurring")); } @Test @@ -338,12 +338,28 @@ public void testTaskExtendedMetadata() throws Exception { parseContext.set(OfficeParserConfig.class, officeParserConfig); List metadataList = getRecursiveMetadata("testMSG_Task.msg", parseContext); Metadata m = metadataList.get(0); - assertTrue(m.get("mapi:raw:PidLidToDoOrdinalDate").contains("2017-02-28T18:44")); - assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:44")); - assertEquals("0", m.get("mapi:raw:PidLidTaskActualEffort")); - assertEquals("false", m.get("mapi:raw:PidLidTeamTask")); + assertTrue(m.get("mapi:property:PidLidToDoOrdinalDate").contains("2017-02-28T18:44")); + assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:44")); + assertEquals("0", m.get("mapi:property:PidLidTaskActualEffort")); + assertEquals("false", m.get("mapi:property:PidLidTeamTask")); } + @Test + public void testContactExtendedMetadata() throws Exception { + List metadataList = getRecursiveMetadata("testMSG_Contact.msg"); + Metadata m = metadataList.get(0); + assertEquals("2017-02-28T18:41:37Z", m.get("mapi:property:PidLidValidFlagStringProof")); + } + + + @Test + public void testPostExtendedMetadata() throws Exception { + List metadataList = getRecursiveMetadata("testMSG_Post.msg"); + Metadata m = metadataList.get(0); + assertEquals("2017-02-28T18:47:11Z", m.get("mapi:property:PidLidValidFlagStringProof")); + } + + @Test public void testHandlingAllAlternativesBodies() throws Exception { //test that default only has one body