Skip to content

Commit f217270

Browse files
Enhance tika document parsing tests (opensearch-project#13618)
* Update tika document parsing bwc tests. Signed-off-by: Carroll <carrofin@amazon.com> * Skip sample tika files which do not parse consistently. Signed-off-by: Carroll <carrofin@amazon.com> * Formatting for spotlessJavaCheck. Signed-off-by: Carroll <carrofin@amazon.com> * Use fixed locale for consistent tika parsing. Signed-off-by: Carroll <carrofin@amazon.com> * Move sha1 map to .checksums file. Signed-off-by: Carroll <carrofin@amazon.com> * For locale-dependent files, do not verify contents with hash. Signed-off-by: Carroll <carrofin@amazon.com> * Remove strict checksum validation for additional locale-dependent files. Signed-off-by: Carroll <carrofin@amazon.com> --------- Signed-off-by: Carroll <carrofin@amazon.com>
1 parent da3ab92 commit f217270

File tree

2 files changed

+248
-26
lines changed
  • plugins/ingest-attachment/src/test

2 files changed

+248
-26
lines changed

plugins/ingest-attachment/src/test/java/org/opensearch/ingest/attachment/TikaDocTests.java

+39-26
Original file line numberDiff line numberDiff line change
@@ -32,54 +32,67 @@
3232

3333
package org.opensearch.ingest.attachment;
3434

35+
import org.apache.commons.codec.digest.DigestUtils;
3536
import org.apache.lucene.tests.util.LuceneTestCase.SuppressFileSystems;
3637
import org.apache.lucene.tests.util.TestUtil;
3738
import org.apache.tika.metadata.Metadata;
3839
import org.opensearch.common.io.PathUtils;
40+
import org.opensearch.common.xcontent.XContentHelper;
41+
import org.opensearch.common.xcontent.json.JsonXContent;
3942
import org.opensearch.test.OpenSearchTestCase;
4043

4144
import java.nio.file.DirectoryStream;
4245
import java.nio.file.Files;
4346
import java.nio.file.Path;
47+
import java.util.Map;
4448

4549
/**
46-
* Evil test-coverage cheat, we parse a bunch of docs from tika
47-
* so that we have a nice grab-bag variety, and assert some content
48-
* comes back and no exception.
50+
* Parse sample tika documents and assert the contents have not changed according to previously recorded checksums.
51+
* Uncaught changes to tika parsing could potentially pose bwc issues.
52+
* Note: In some cases tika will access a user's locale to inform the parsing of a file.
53+
* The checksums of these files are left empty, and we only validate that parsed content is not null.
4954
*/
5055
@SuppressFileSystems("ExtrasFS") // don't try to parse extraN
5156
public class TikaDocTests extends OpenSearchTestCase {
5257

53-
/** some test files from tika test suite, zipped up */
58+
/** some test files from the apache tika unit test suite with accompanying sha1 checksums */
5459
static final String TIKA_FILES = "/org/opensearch/ingest/attachment/test/tika-files/";
60+
static final String TIKA_CHECKSUMS = "/org/opensearch/ingest/attachment/test/.checksums";
5561

56-
public void testFiles() throws Exception {
57-
Path tmp = createTempDir();
58-
logger.debug("unzipping all tika sample files");
59-
try (DirectoryStream<Path> stream = Files.newDirectoryStream(PathUtils.get(getClass().getResource(TIKA_FILES).toURI()))) {
60-
for (Path doc : stream) {
61-
String filename = doc.getFileName().toString();
62-
TestUtil.unzip(getClass().getResourceAsStream(TIKA_FILES + filename), tmp);
63-
}
64-
}
62+
public void testParseSamples() throws Exception {
63+
String checksumJson = Files.readString(PathUtils.get(getClass().getResource(TIKA_CHECKSUMS).toURI()));
64+
Map<String, Object> checksums = XContentHelper.convertToMap(JsonXContent.jsonXContent, checksumJson, false);
65+
DirectoryStream<Path> stream = Files.newDirectoryStream(unzipToTemp(TIKA_FILES));
6566

66-
try (DirectoryStream<Path> stream = Files.newDirectoryStream(tmp)) {
67-
for (Path doc : stream) {
68-
logger.debug("parsing: {}", doc);
69-
assertParseable(doc);
67+
for (Path doc : stream) {
68+
String parsedContent = tryParse(doc);
69+
assertNotNull(parsedContent);
70+
assertFalse(parsedContent.isEmpty());
71+
72+
String check = checksums.get(doc.getFileName().toString()).toString();
73+
if (!check.isEmpty()) {
74+
assertEquals(check, DigestUtils.sha1Hex(parsedContent));
7075
}
7176
}
77+
78+
stream.close();
7279
}
7380

74-
void assertParseable(Path fileName) throws Exception {
75-
try {
76-
byte bytes[] = Files.readAllBytes(fileName);
77-
String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
78-
assertNotNull(parsedContent);
79-
assertFalse(parsedContent.isEmpty());
80-
logger.debug("extracted content: {}", parsedContent);
81-
} catch (Exception e) {
82-
throw new RuntimeException("parsing of filename: " + fileName.getFileName() + " failed", e);
81+
private Path unzipToTemp(String zipDir) throws Exception {
82+
Path tmp = createTempDir();
83+
DirectoryStream<Path> stream = Files.newDirectoryStream(PathUtils.get(getClass().getResource(zipDir).toURI()));
84+
85+
for (Path doc : stream) {
86+
String filename = doc.getFileName().toString();
87+
TestUtil.unzip(getClass().getResourceAsStream(zipDir + filename), tmp);
8388
}
89+
90+
stream.close();
91+
return tmp;
92+
}
93+
94+
private String tryParse(Path doc) throws Exception {
95+
byte bytes[] = Files.readAllBytes(doc);
96+
return TikaImpl.parse(bytes, new Metadata(), -1);
8497
}
8598
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
{
2+
"testWORD_tabular_symbol.doc": "c708d7ef841f7e1748436b8ef5670d0b2de1a227",
3+
"testWORD_1img.docx": "367e2ade13ca3c19bcd8a323e21d51d407e017ac",
4+
"testMasterFooter.odp": "bcc59df70699c739423a50e362c722b81ae76498",
5+
"testTXTNonASCIIUTF8.txt": "1ef514431ca8d838f11e99f8e4a0637730b77aa0",
6+
"EmbeddedOutlook.docx": "c544a6765c19ba11b0bf3edb55c79e1bd8565c6e",
7+
"testWORD_override_list_numbering.docx": "4e892319b921322916225def763f451e4bbb4e16",
8+
"testTextBoxes.key": "b01581d5bd2483ce649a1a1406136359f4b93167",
9+
"testPPT_masterText.pptx": "9fee8337b76dc3e196f4554dcde22b9dd1c3b3e8",
10+
"testComment.docx": "333b9009686f27265b4729e8172b3e62048ec7ec",
11+
"testRTFInvalidUnicode.rtf": "32b3e3d8e5c5a1b66cb15fc964b9341bea7048f4",
12+
"testEXCEL_headers_footers.xlsx": "9e8d2a700fc431fe29030e86e08162fc8ecf2c1a",
13+
"testWORD6.doc": "1479de589755c7212815445799c44dab69d4587c",
14+
"testPagesHeadersFootersFootnotes.pages": "99d434be7de4902dc70700aa9c2a31624583c1f1",
15+
"testPDF_no_extract_yes_accessibility_owner_empty.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c",
16+
"testOpenOffice2.odt": "564b3e1999a53073a04142e01b663757a6e7fb08",
17+
"testTables.key": "250cff75db7fc3c8b95b2cbd3f37308826e0c93d",
18+
"testDOCX_Thumbnail.docx": "fce6a43271bc242e2bb8341afa659ed166e08050",
19+
"testWORD_3imgs.docx": "292ca6fa41d32b462e66061e89adb19423721975",
20+
"testPDF_acroform3.pdf": "dcf6588cb5e41701b168606ea6bfbadecdcd3bc9",
21+
"testWORD_missing_ooxml_bean1.docx": "c3058f2513fecc0a6d76d3ecf55676f236b085ff",
22+
"testPDFTwoTextBoxes.pdf": "4adf324ce030076b1755fdb3a6cce676ee325ae4",
23+
"testRTFUnicodeGothic.rtf": "f9932470ff686b0c217ea94ed5d4f2fd85f7998e",
24+
"headers.mbox": "75ec25789fe870b6d25365e4ea73d731fc274847",
25+
"testPPT_embeded.ppt": "",
26+
"testXML3.xml": "804d4812408eb324ae8483d2140b648ec871dd2a",
27+
"testOptionalHyphen.doc": "10f9ca38cc2985e94967aa2c454bfe40aff76976",
28+
"testComment.doc": "66e57653d5d08478556ca640408b172b65855cc7",
29+
"testEXCEL_headers_footers.xls": "18977c66fc8bcb8c44de3063b69b65a3de9c3f25",
30+
"testWORD_embedded_rtf.doc": "cc2d289acfe3d1068a2649b7fa0c06c50bb6ceda",
31+
"testEXCEL_custom_props.xlsx": "6b72ae08362a204b37dbba0a30b4134ae3e7918f",
32+
"testOptionalHyphen.docx": "5b8ffc0df1691a8fed7d63aa9b256e9e02e36d71",
33+
"testPPT_various.pptx": "d149de9af8071141a6ba6e2cd4ef5f6d9431a826",
34+
"testWORD_closingSmartQInHyperLink.doc": "9859f378c603b70bf0d44a281169ae5b16a21878",
35+
"test_embedded_zip.pptx": "d19406edcec09440d066877c451ceba60abc3483",
36+
"testRTFUmlautSpaces.rtf": "155b39879c5b5fbad22fd650be37ae7f91489eb2",
37+
"protectedFile.xlsx": "ee08eeaf05c35c960243f831c3a974d9ee07aa28",
38+
"Doc1_ole.doc": "fb63220506ab666f1fe87b0608e1447fd4fd3489",
39+
"testEXCEL_embeded.xlsx": "",
40+
"EmbeddedDocument.docx": "",
41+
"testODFwithOOo3.odt": "3815d6fb7f5829db882ea8ebd664f252711e6e60",
42+
"testPagesHeadersFootersRomanUpper.pages": "85b3cd545ba6c33e5d44b844a6afea8cb6eaec0b",
43+
"testPPT_comment.ppt": "88fd667fd0292785395a8d0d229304aa91110556",
44+
"testPPT_2imgs.pptx": "66eda11ad472918153100dad8ee5be0f1f8e2e04",
45+
"testPagesHeadersFootersAlphaUpper.pages": "56bef0d1eaedfd7599aae29031d2eeb0e3fe4688",
46+
"testWORD_text_box.docx": "e01f7b05c6aac3449b9a699c3e4d2e62ff3368a3",
47+
"testWORD_missing_text.docx": "3814332884a090b6d1020bff58d0531486710c45",
48+
"testComment.pdf": "60e181061a00454c2e622bd37a9878234c13231d",
49+
"testPDF_no_extract_no_accessibility_owner_empty.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c",
50+
"test_embedded_package.rtf": "cd90adb3f777e68aa0288fd23e8f4fbce260a763",
51+
"testPDF_bom.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c",
52+
"testOptionalHyphen.ppt": "7e016e42860bd408054bb8653fef39b2756119d9",
53+
"testHTML_utf8.html": "3ba828044754772e4c9df5f9a2213beaa75842ef",
54+
"testPPT_comment.pptx": "25fab588194dabd5902fd2ef880ee9542d036776",
55+
"testRTFWithCurlyBraces.rtf": "019cab63b73ff89d094823cf50c0a721bec08ee2",
56+
"testFooter.ods": "846e1d0415b23fa27631b536b0cf566abbf8fcc1",
57+
"testPPT.ppt": "933ee556884b1d9e28b801daa0d77bbaa4f4be62",
58+
"testEXCEL-formats.xls": "",
59+
"testPPT_masterFooter.pptx": "29bb97006b3608b7db6ff72b94d20157878d94dd",
60+
"testWORD_header_hyperlink.doc": "914bbec0730c54948ad307ea3e375ef0c100abf1",
61+
"testRTFHyperlink.rtf": "2b2ffb1997aa495fbab1af490d134051de168c97",
62+
"testExtraSpaces.pdf": "b5575400309b01c1050a927d8d1ecf8761062abc",
63+
"testRTFWindowsCodepage1250.rtf": "7ba418843f401634f97d21c844c2c4093b7194fb",
64+
"testRTFTableCellSeparation2.rtf": "62782ca40ff0ed6c3ba90f8055ee724b44af203f",
65+
"testPagesHeadersFootersRomanLower.pages": "2410fc803907001eb39c201ad4184b243e271c6d",
66+
"headerPic.docx": "c704bb648feac7975dff1024a5f762325be7cbc2",
67+
"testHTMLNoisyMetaEncoding_4.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
68+
"testRTFBoldItalic.rtf": "0475d224078682cf3f9f3f4cbc14a63456c5a0d8",
69+
"test-outlook.msg": "1f202fc11a873e305d5b4d4607409f3f734065ec",
70+
"testRTFVarious.rtf": "bf6ea9cf57886e680c5e6743a66a12b950a09083",
71+
"testXHTML.html": "c6da900f81c1c550518e65d579d3dd62dd7c5c0c",
72+
"EmbeddedPDF.docx": "454476bdf4a968189a6f53e75c146382bf58a434",
73+
"testXML.xml": "e1615e9b31be58f7af9ad963e5a112efa5cdaffa",
74+
"testWORD_no_format.docx": "9a3f5d8a4c8c0f077cc615bcfc554dc87d5926aa",
75+
"testPPT_masterText.ppt": "f5ff5e2d45ccb180cf371ed99b7dfeb2a93539b3",
76+
"testPDF_PDFEncodedStringInXMP.pdf": "78fd59d394f72d28a9908739fa562099978dafa1",
77+
"testPPT_custom_props.pptx": "72152d28afbc23a50cc71fa37d1dce9ef03ca72d",
78+
"testRTFListOverride.rtf": "f8c61d8a66afdaa07f3740e859497818bfc2ca01",
79+
"testEXCEL_1img.xls": "",
80+
"testWORD_1img.doc": "0826d299a7770e93603f5667d89dccb7b74d904c",
81+
"testNPEOpenDocument.odt": "4210b973c80084c58463ec637fa43e911f77d6fe",
82+
"testRTFWord2010CzechCharacters.rtf": "9443011aac32434240ab8dbff360c970fc1c7074",
83+
"testPDF_Version.8.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
84+
"testPPT.ppsx": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
85+
"testPPT_autodate.pptx": "50467dbb37d1c74b8b37fe93eddf6f9e87d21bf3",
86+
"testWordArt.pptx": "3566bbee790704b3654fe78319957f9e0cddb6d9",
87+
"NullHeader.docx": "18430c968ba29173b52610efdaa723424b3c4d79",
88+
"testRTFWordPadCzechCharacters.rtf": "5dbb58452a3507c384008662f8fce90063f12189",
89+
"resume.html": "fbfb9d8264f6eebd79847fe7a7f1b81edd4a027d",
90+
"testPagesLayout.pages": "5db1ab91c93e6183d0af8513f62c7b87964704af",
91+
"testOptionalHyphen.pptx": "c2977eefe7d2cad8c671f550d7883185ec65591b",
92+
"testWORD_numbered_list.docx": "07194c58165993468e66bc4eba4f5bd89d5bee09",
93+
"testEXCEL_1img.xlsx": "",
94+
"testPDFTripleLangTitle.pdf": "6eb693dac68fece3bf3cd1aa9880ea9b23fc927c",
95+
"protect.xlsx": "ee08eeaf05c35c960243f831c3a974d9ee07aa28",
96+
"testWORD_bold_character_runs2.docx": "f10e562d8825ec2e17e0d9f58646f8084a658cfa",
97+
"testXLSX_Thumbnail.xlsx": "020bf155ae157661c11727c54e6694cf9cd2c0d3",
98+
"testWORD_embedded_pdf.docx": "d8adb797aaaac92afd8dd9b499bd197347f15688",
99+
"testOptionalHyphen.rtf": "2f77b61bab5b4502b4ddd5018b454be157091d07",
100+
"testEXCEL-charts.xls": "",
101+
"testWORD_override_list_numbering.doc": "60e47a3e71ba08af20af96131d61740a1f0bafa3",
102+
"testPDF_twoAuthors.pdf": "c5f0296cc21f9ae99ceb649b561c55f99d7d9452",
103+
"testPDF_Version.10.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
104+
"testHTMLNoisyMetaEncoding_2.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
105+
"testFooter.odt": "cd5d0fcbcf48d6f005d087c47d00e84f39bcc321",
106+
"testPPT.pptm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
107+
"testPPT_various.ppt": "399e27a9893284f106dc44f15b5e636454db681e",
108+
"testRTFListMicrosoftWord.rtf": "0303eb3e2f30530621a7a407847b759a3b21467e",
109+
"testWORD_bold_character_runs2.doc": "f10e562d8825ec2e17e0d9f58646f8084a658cfa",
110+
"boilerplate-whitespace.html": "a9372bc75d7d84cbcbb0bce68fcaed73ad8ef52c",
111+
"testEXCEL_95.xls": "20d9b9b0f3aecd28607516b4b837c8bab3524b6c",
112+
"testPPT_embedded_two_slides.pptx": "",
113+
"testPDF_bookmarks.pdf": "5fc486c443511452db4f1aa6530714c6aa49c831",
114+
"test_recursive_embedded.docx": "afc32b07ce07ad273e5b3d1a43390a9d2b6dd0a9",
115+
"testEXCEL-formats.xlsx": "",
116+
"testPPT_masterText2.pptx": "2b01eab5d0349e3cfe791b28c70c2dbf4efc884d",
117+
"test.doc": "774be3106edbb6d80be36dbb548d62401dcfa0fe",
118+
"test_recursive_embedded_npe.docx": "afc32b07ce07ad273e5b3d1a43390a9d2b6dd0a9",
119+
"testPPT_embedded2.ppt": "80e106b3fc68107e7f9579cff04e3b15bdfc557a",
120+
"testWORD_custom_props.docx": "e7a737a5237a6aa9c6b3fc677eb8fa65c30d6dfe",
121+
"testPDF_Version.4.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
122+
"testBinControlWord.rtf": "ef858fbb7584ea7f92ffed8d0a08c1cc35ffee07",
123+
"testWORD_null_style.docx": "0be9dcfb83423c78a06af514ec21e4e7770ec48e",
124+
"test-outlook2003.msg": "bb3c35eb7e95d657d7977c1d3d52862734f9f329",
125+
"testPDFVarious.pdf": "c66bbbacb10dd27430f7d0bed9518e75793cedae",
126+
"testHTMLNoisyMetaEncoding_3.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
127+
"testRTFCorruptListOverride.rtf": "116a782d02a7f25010a15cbbb189bf98e6b89855",
128+
"testEXCEL_custom_props.xls": "b5584d9b13ab1566ce539238dc75e7eb3449ba7f",
129+
"testPDF_Version.7.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
130+
"testPDFEmbeddingAndEmbedded.docx": "e7b648adb15cd16cdd84437c2b9524a8eeb213e4",
131+
"testHTMLNoisyMetaEncoding_1.html": "630e14e3495a78580c4e26fa3bbe3123ccf4fd8a",
132+
"testWORD_3imgs.doc": "818aa8c6c44dd78c49100c3c38e95abdf3812981",
133+
"testRTFEmbeddedLink.rtf": "2720ffb5ff3a6bbb2c5c1cb43fb4922362ed788a",
134+
"testKeynote.key": "11387b59fc6339bb73653fcbb26d387521b98ec9",
135+
"testPDF.pdf": "5a377554685367764eaf73d093408ace323fcec7",
136+
"protectedSheets.xlsx": "",
137+
"testWORD.doc": "cdd41377e699287cbbe17fbb1498cfe5814dde23",
138+
"testComment.xlsx": "d4be580bb97c1c90be379281179c7932b37a18c0",
139+
"testPDFPackage.pdf": "75d6fa216b4e2880a65ced55d17ca2b599d2606c",
140+
"testWORD_embeded.doc": "",
141+
"testHTML.html": "6548b16c5ea33e907577615ce60ca4876a3936ef",
142+
"testEXCEL_5.xls": "a174f098333c659d331317641d4d1d9d83055288",
143+
"pictures.ppt": "95bbfdbf2f60f74371285c337d3445d0acd59a9b",
144+
"testPPT_masterText2.ppt": "f5ff5e2d45ccb180cf371ed99b7dfeb2a93539b3",
145+
"testPDF-custommetadata.pdf": "a84b914655db55574e6002b6f37209ecd4c3d462",
146+
"testWORD_embeded.docx": "",
147+
"testStyles.odt": "c25dd05633e3aab7132d2f5608126e2b4b03848f",
148+
"testPDF_multiFormatEmbFiles.pdf": "2103b2c30b44d5bb3aa790ab04a6741a10ea235a",
149+
"testXML2.xml": "a8c85a327716fad93faa4eb0f993057597d6f471",
150+
"testPagesComments.pages": "cbb45131cf45b9c454e754a07af3ae927b1a69cc",
151+
"testEXCEL_4.xls": "8d5e6156222151faaccb079d46ddb5393dd25771",
152+
"testWORD_no_format.doc": "88feaf03fe58ee5cc667916c6a54cbd5d605cc1c",
153+
"testPages.pages": "288e6db2f39604e372a2095257509c78dba22cbb",
154+
"footnotes.docx": "33b01b73a12f9e14efbcc340890b11ee332dca8e",
155+
"testWORD_bold_character_runs.doc": "f10e562d8825ec2e17e0d9f58646f8084a658cfa",
156+
"testWORD_custom_props.doc": "e7a737a5237a6aa9c6b3fc677eb8fa65c30d6dfe",
157+
"testPDF_Version.11.x.PDFA-1b.pdf": "71853c6197a6a7f222db0f1978c7cb232b87c5ee",
158+
"testAnnotations.pdf": "5f599e7916198540e1b52c3e472a525f50fd45f6",
159+
"tika434.html": "7d74122631f52f003a48018cc376026ccd8d984e",
160+
"testPagesHeadersFootersAlphaLower.pages": "fc1d766908134ff4689fa63fa3e91c3e9b08d975",
161+
"testRTFRegularImages.rtf": "756b1db45cb05357ceaf9c8efcf0b76e3913e190",
162+
"testRTFUmlautSpaces2.rtf": "1fcd029357062241d74d789e93477c101ff24e3f",
163+
"testWORD_numbered_list.doc": "e06656dd9b79ac970f3cd065fa8b630a4981556f",
164+
"testPPT_autodate.ppt": "05b93967ea0248ad263b2f24586e125df353fd3d",
165+
"testBulletPoints.key": "92242d67c3dbc1b22aac3f98e47061d09e7719f9",
166+
"testMasterSlideTable.key": "1d61e2fa3c3f3615500c7f72f62971391b9e9a2f",
167+
"testWORD_various.doc": "8cbdf1a4e0d78471eb90403612c4e92866acf0cb",
168+
"testEXCEL_textbox.xlsx": "1e81121e91e58a74d838e414ae0fc0055a4b4100",
169+
"big-preamble.html": "a9d759b46b6c6c1857d0d89c3a75ee2f3ace70c9",
170+
"testWORD.docx": "f72140bef19475e950e56084d1ab1cb926697b19",
171+
"testComment.rtf": "f6351d0f1f20c4ee0fff70adca6abbc6e638610e",
172+
"testRTFUnicodeUCNControlWordCharacterDoubling.rtf": "3e6f2f38682e38ffc96a476ca51bec2291a27fa7",
173+
"testPDF_Version.5.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
174+
"testPPTX_Thumbnail.pptx": "6aa019154289317c7b7832fe46556e6d61cd0a9f",
175+
"testRTFTableCellSeparation.rtf": "5647290a3197c1855fad10201dc7be60ea7b0e42",
176+
"testRTFControls.rtf": "aee6afb80e8b09cf49f056020c037f70c2757e49",
177+
"testEXCEL.xls": "",
178+
"testRTFJapanese.rtf": "08976f9a7d6d3a155cad84d7fa23295cb972a17a",
179+
"testPageNumber.pdf": "96b03d2cc6782eba653af28228045964e68422b5",
180+
"testOptionalHyphen.pdf": "12edd450ea76ea4e79f80ebd3442999ec2180dbc",
181+
"testPDFFileEmbInAnnotation.pdf": "97a6e5781bbaa6aea040546d797c4916f9d90c86",
182+
"testFontAfterBufferedText.rtf": "d1c8757b3ed91f2d7795234405c43005868affa3",
183+
"testPPT_masterFooter.ppt": "8c9104385820c2631ddda20814231808fac03d4d",
184+
"testWORD_various.docx": "189df989e80afb09281901aefc458c6630a8530b",
185+
"testComment.ppt": "21842dd9cb8a7d4af0f102543c192861c9789705",
186+
"testPopupAnnotation.pdf": "1717b1d16c0a4b9ff5790cac90fc8e0fba170a35",
187+
"testWORD_bold_character_runs.docx": "f10e562d8825ec2e17e0d9f58646f8084a658cfa",
188+
"testOverlappingText.pdf": "726da7d6c184512ed8d44af2a5085d65523c4572",
189+
"testRTF.rtf": "91e830ceba556741116c9e83b0c69a0d6c5c9304",
190+
"testRTFIgnoredControlWord.rtf": "1eb6a2f2fd32b1bb4227c0c02a35cb6027d9ec8c",
191+
"testComment.xls": "4de962f16452159ce302fc4a412b06a06cf9a0f6",
192+
"testPPT.ppsm": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
193+
"boilerplate.html": "b3558f02c3179e4aeeb6057594d87bda79964e7b",
194+
"testEXCEL_embeded.xls": "",
195+
"testEXCEL.xlsx": "",
196+
"testPPT_2imgs.ppt": "9a68072ffcf171389e78cf8bc018c4b568a6202d",
197+
"testComment.pptx": "6ae6052f469b8f901fd4fd8bc70f8e267255a58e",
198+
"testPDF_Version.6.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
199+
"testPPT.pptx": "71333ef84f7825d8ad6aba2ba993d04b4bab41c6",
200+
"testPPT_custom_props.ppt": "edf196acc12701accc7be5dfe63e053436db45e6",
201+
"testPPT_embeded.pptx": "",
202+
"testRTFListLibreOffice.rtf": "4c38d9e2f0a8c9a4c2cc8d2a52db9591ab759abe",
203+
"testPDF_Version.9.x.pdf": "03b60dfc8c103dbabeedfd682e979f96dd8983a2",
204+
"testRTFHexEscapeInsideWord.rtf": "6cffda07e774c55b5465d8134a0bdcb8c30f3386",
205+
"testRTFNewlines.rtf": "2375ca14e2b0d8f7ff6bbda5191544b3ee7c09fb",
206+
"testRTF-ms932.rtf": "5f9db1b83bf8e9c4c6abb065adaeb151307d33f2",
207+
"test_TIKA-1251.doc": "5a9394c34274964055fdd9272b4f7dc314b99ecf",
208+
"test_list_override.rtf": "9fe8b4a36c5222fe7ed2e9b54e2330aec8fa9423"
209+
}

0 commit comments

Comments
 (0)