Skip to content

Commit

Permalink
cherry-pick TIKA-4381
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Feb 25, 2025
1 parent aba99e2 commit 9268a67
Show file tree
Hide file tree
Showing 12 changed files with 2,309 additions and 215 deletions.
4 changes: 4 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Release 3.2.0 - ???

* Improve extraction of properties from msg files (TIKA-4381).

Release 3.1.0 - 01/28/25

* Allow users to turn off the injection of some headers into the content stream of MSG
Expand Down
79 changes: 79 additions & 0 deletions tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.metadata;

/**
 * Properties that typically appear in MSG/PST message format files.
 * <p>
 * Every key is built from one of the {@code PREFIX_*} constants declared below.
 *
 * @since Apache Tika 3.2.0
 */
public interface MAPI {

    /** Prefix shared by the message-level property names in this interface. */
    String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

    /** Prefix shared by the attachment-level ({@code ATTACH_*}) property names. */
    String PREFIX_MAPI_ATTACH_META = "mapi:attach" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

    /**
     * Prefix built from {@link #PREFIX_MAPI_META} plus {@code "raw"}.
     * No property declared in this interface uses it.
     */
    String PREFIX_MAPI_RAW_META = PREFIX_MAPI_META + "raw" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

    /**
     * MAPI message class. What type of .msg/MAPI file is it?
     * This is normalized via "mapi_message_classes.properties".
     */
    Property MESSAGE_CLASS = Property.internalText(PREFIX_MAPI_META + "message-class");

    /**
     * MAPI message class. What type of .msg/MAPI file is it?
     * This is the raw value that is retrieved from the underlying chunk.
     */
    Property MESSAGE_CLASS_RAW = Property.internalText(PREFIX_MAPI_META + "message-class-raw");

    /** Text property keyed {@code sent-by-server-type}. */
    Property SENT_BY_SERVER_TYPE = Property.internalText(PREFIX_MAPI_META + "sent-by-server-type");

    /** Text property keyed {@code from-representing-name}. */
    Property FROM_REPRESENTING_NAME = Property.internalText(PREFIX_MAPI_META + "from-representing-name");

    /** Text property keyed {@code from-representing-email}. */
    Property FROM_REPRESENTING_EMAIL = Property.internalText(PREFIX_MAPI_META + "from-representing-email");

    /** Date property keyed {@code msg-submission-accepted-at-time}. */
    Property SUBMISSION_ACCEPTED_AT_TIME = Property.internalDate(PREFIX_MAPI_META + "msg-submission-accepted-at-time");

    /** Text property keyed {@code msg-submission-id}. */
    Property SUBMISSION_ID = Property.internalText(PREFIX_MAPI_META + "msg-submission-id");

    /** Text property keyed {@code internet-message-id}. */
    Property INTERNET_MESSAGE_ID = Property.internalText(PREFIX_MAPI_META + "internet-message-id");

    /** Text-bag property keyed {@code internet-references}. */
    Property INTERNET_REFERENCES = Property.internalTextBag(PREFIX_MAPI_META + "internet-references");

    /** Text property keyed {@code conversation-topic}. */
    Property CONVERSATION_TOPIC = Property.internalText(PREFIX_MAPI_META + "conversation-topic");

    /** Text property keyed {@code conversation-index}. */
    Property CONVERSATION_INDEX = Property.internalText(PREFIX_MAPI_META + "conversation-index");

    /** Text property keyed {@code in-reply-to-id}. */
    Property IN_REPLY_TO_ID = Property.internalText(PREFIX_MAPI_META + "in-reply-to-id");

    /** Text property keyed {@code recipients-string}. */
    Property RECIPIENTS_STRING = Property.internalText(PREFIX_MAPI_META + "recipients-string");

    /** Integer property keyed {@code importance}. */
    Property IMPORTANCE = Property.internalInteger(PREFIX_MAPI_META + "importance");

    /** Integer property keyed {@code priority}. */
    Property PRIORITY = Property.internalInteger(PREFIX_MAPI_META + "priority");

    /**
     * Misspelled alias kept so existing references keep compiling; it is the
     * very same {@link Property} instance as {@link #PRIORITY}.
     *
     * @deprecated use {@link #PRIORITY}
     */
    @Deprecated
    Property PRIORTY = PRIORITY;

    /** Boolean property keyed {@code is-flagged}. */
    Property IS_FLAGGED = Property.internalBoolean(PREFIX_MAPI_META + "is-flagged");

    // Attachment-level properties: all are text properties whose keys start
    // with PREFIX_MAPI_ATTACH_META.
    Property ATTACH_LONG_PATH_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "long-path-name");
    Property ATTACH_LONG_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "long-file-name");
    Property ATTACH_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "file-name");
    Property ATTACH_CONTENT_ID = Property.internalText(PREFIX_MAPI_ATTACH_META + "content-id");
    Property ATTACH_CONTENT_LOCATION = Property.internalText(PREFIX_MAPI_ATTACH_META + "content-location");
    Property ATTACH_DISPLAY_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "display-name");
    Property ATTACH_EXTENSION = Property.internalText(PREFIX_MAPI_ATTACH_META + "extension");
    Property ATTACH_MIME = Property.internalText(PREFIX_MAPI_ATTACH_META + "mime");
    Property ATTACH_LANGUAGE = Property.internalText(PREFIX_MAPI_ATTACH_META + "language");

}

Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler x
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName,
        XHTMLContentHandler xhtml, boolean outputHtml)
        throws IOException, SAXException, TikaException {
    // This overload takes no metadata from the caller, so it hands a fresh,
    // empty Metadata to the overload that accepts one.
    final Metadata emptyMetadata = new Metadata();
    handleEmbeddedOfficeDoc(dir, emptyMetadata, resourceName, xhtml, outputHtml);
}
/**
* Handle an office document that's embedded at the POIFS level
*/
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, Metadata metadata,
String resourceName, XHTMLContentHandler xhtml, boolean outputHtml)
throws IOException, SAXException, TikaException {


// Is it an embedded OLE2 document, or an embedded OOXML document?
Expand All @@ -165,7 +173,6 @@ protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName,

if (ooxml != null) {
// It's OOXML (has a ZipFile):
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_LENGTH,
Integer.toString(((DocumentEntry)ooxml).getSize()));
try (TikaInputStream stream = TikaInputStream
Expand All @@ -191,7 +198,6 @@ protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName,
// It's regular OLE2:

// What kind of document is it?
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, dir.getName());
if (dir.getStorageClsid() != null) {
metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ public class OfficeParserConfig implements Serializable {
private String dateOverrideFormat = null;
private int maxOverride = 0;//ignore

// Off by default; read through isExtractExtendedMsgProperties() and changed
// through setExtractExtendedMsgProperties(boolean).
private boolean extractExtendedMsgProperties = false;

/**
* @return whether or not to extract macros
*/
Expand Down Expand Up @@ -292,6 +294,14 @@ public void setMaxOverride(int maxOverride) {
public int getMaxOverride() {
    // Plain accessor: returns the stored maxOverride value unchanged.
    return maxOverride;
}

/**
 * NOTE(review): presumably governs whether the additional MAPI properties
 * of .msg files are extracted -- confirm against the parser that reads it.
 *
 * @return the current value of the extractExtendedMsgProperties flag,
 *         which is initialised to {@code false}
 */
public boolean isExtractExtendedMsgProperties() {
    return this.extractExtendedMsgProperties;
}

/**
 * Stores the flag that {@link #isExtractExtendedMsgProperties()} reports.
 *
 * @param extractExtendedMsgProperties the new value of the flag
 */
public void setExtractExtendedMsgProperties(boolean extractExtendedMsgProperties) {
    // "this." is required: the parameter shadows the field of the same name.
    this.extractExtendedMsgProperties = extractExtendedMsgProperties;
}
}


Loading

0 comments on commit 9268a67

Please sign in to comment.