@@ -174,7 +174,12 @@ public Collection<GeoRecord> getGeoRecords( Collection<String> accessions ) thro
174
174
175
175
private void getGeoBasicRecords ( List <GeoRecord > records , String searchUrlString ) throws IOException {
176
176
URL searchUrl = new URL ( searchUrlString );
177
- Document searchDocument = parseMiniMLDocument ( searchUrl );
177
+ Document searchDocument ;
178
+ try {
179
+ searchDocument = parseMiniMLDocument ( searchUrl );
180
+ } catch ( EmptyXmlDocumentException e ) {
181
+ throw new RuntimeException ( "Empty MINiML document for " + searchUrl , e );
182
+ }
178
183
179
184
NodeList countNode = searchDocument .getElementsByTagName ( "Count" );
180
185
Node countEl = countNode .item ( 0 );
@@ -201,8 +206,8 @@ private void getGeoBasicRecords( List<GeoRecord> records, String searchUrlString
201
206
t .start ();
202
207
203
208
NodeList accNodes , titleNodes , sampleNodes , dateNodes , orgnNodes , platformNodes , summaryNodes , typeNodes , pubmedNodes ;
204
- Document summaryDocument = parseMiniMLDocument ( fetchUrl );
205
209
try {
210
+ Document summaryDocument = parseMiniMLDocument ( fetchUrl );
206
211
accNodes = ( NodeList ) xaccession .evaluate ( summaryDocument , XPathConstants .NODESET );
207
212
titleNodes = ( NodeList ) xtitle .evaluate ( summaryDocument , XPathConstants .NODESET );
208
213
sampleNodes = ( NodeList ) xnumSamples .evaluate ( summaryDocument , XPathConstants .NODESET );
@@ -212,6 +217,8 @@ private void getGeoBasicRecords( List<GeoRecord> records, String searchUrlString
212
217
summaryNodes = ( NodeList ) xsummary .evaluate ( summaryDocument , XPathConstants .NODESET );
213
218
typeNodes = ( NodeList ) xtype .evaluate ( summaryDocument , XPathConstants .NODESET );
214
219
pubmedNodes = ( NodeList ) xpubmed .evaluate ( summaryDocument , XPathConstants .NODESET );
220
+ } catch ( EmptyXmlDocumentException e ) {
221
+ throw new RuntimeException ( "Empty MINiML document for " + fetchUrl , e );
215
222
} catch ( XPathExpressionException e ) {
216
223
throw new RuntimeException ( String .format ( "Failed to parse XML for %s" , fetchUrl ), e );
217
224
}
@@ -282,7 +289,12 @@ public Collection<GeoRecord> getAllGEOPlatforms() throws IOException {
282
289
}
283
290
284
291
URL searchUrl = new URL ( searchUrlString );
285
- Document searchDocument = parseMiniMLDocument ( searchUrl );
292
+ Document searchDocument ;
293
+ try {
294
+ searchDocument = parseMiniMLDocument ( searchUrl );
295
+ } catch ( EmptyXmlDocumentException e ) {
296
+ throw new RuntimeException ( "Got an empty MINiML document for " + searchUrl , e );
297
+ }
286
298
287
299
NodeList countNode = searchDocument .getElementsByTagName ( "Count" );
288
300
Node countEl = countNode .item ( 0 );
@@ -309,18 +321,19 @@ public Collection<GeoRecord> getAllGEOPlatforms() throws IOException {
309
321
StopWatch t = new StopWatch ();
310
322
t .start ();
311
323
312
- Document summaryDocument = parseMiniMLDocument ( fetchUrl );
313
324
NodeList accNodes , titleNodes , dateNodes , orgnNodes , summaryNodes , techNodes ;
314
325
try {
326
+ Document summaryDocument = parseMiniMLDocument ( fetchUrl );
315
327
accNodes = ( NodeList ) xPlataccession .evaluate ( summaryDocument , XPathConstants .NODESET );
316
328
titleNodes = ( NodeList ) xtitle .evaluate ( summaryDocument , XPathConstants .NODESET );
317
329
summaryNodes = ( NodeList ) xsummary .evaluate ( summaryDocument , XPathConstants .NODESET );
318
330
techNodes = ( NodeList ) xPlatformTech .evaluate ( summaryDocument , XPathConstants .NODESET );
319
331
orgnNodes = ( NodeList ) xorganisms .evaluate ( summaryDocument , XPathConstants .NODESET );
320
332
dateNodes = ( NodeList ) xreleaseDate .evaluate ( summaryDocument , XPathConstants .NODESET );
333
+ } catch ( EmptyXmlDocumentException e ) {
334
+ throw new RuntimeException ( "Got an empty MINiML document for " + fetchUrl , e );
321
335
} catch ( XPathExpressionException e ) {
322
- log .error ( "Could not parse data: " + searchUrl , e );
323
- return Collections .emptyList ();
336
+ throw new RuntimeException ( "Could not parse data for " + searchUrl , e );
324
337
}
325
338
326
339
// consider n_samples (number of elements) and the number of GSEs, but not every record has them, so it would be trickier.
@@ -352,8 +365,8 @@ public Collection<GeoRecord> getAllGEOPlatforms() throws IOException {
352
365
353
366
/**
354
367
* Provides more details than getRecentGeoRecords. Performs an E-utilities query of the GEO database with the given
355
- * searchTerms (search terms can be ommitted ). Returns at most pageSize records. Does some screening of results for
356
- * expression studies, and (optionally) taxa. This is used for identifying data sets for loading
368
+ * searchTerms (search terms can be omitted ). Returns at most pageSize records. Does some screening of results for
369
+ * expression studies, and (optionally) taxa. This is used for identifying data sets for loading.
357
370
*
358
371
* @param start start an offset to retrieve batches
359
372
* @param pageSize page size how many to retrieve
@@ -392,7 +405,12 @@ public List<GeoRecord> getGeoRecordsBySearchTerm( String searchTerms, int start,
392
405
}
393
406
394
407
URL searchUrl = new URL ( searchUrlString );
395
- Document searchDocument = parseMiniMLDocument ( searchUrl );
408
+ Document searchDocument ;
409
+ try {
410
+ searchDocument = parseMiniMLDocument ( searchUrl );
411
+ } catch ( EmptyXmlDocumentException e ) {
412
+ throw new RuntimeException ( "Got an empty MINiML document for " + searchUrl , e );
413
+ }
396
414
397
415
NodeList countNode = searchDocument .getElementsByTagName ( "Count" );
398
416
Node countEl = countNode .item ( 0 );
@@ -422,9 +440,9 @@ public List<GeoRecord> getGeoRecordsBySearchTerm( String searchTerms, int start,
422
440
t .start ();
423
441
int rawRecords = 0 ;
424
442
425
- Document summaryDocument = parseMiniMLDocument ( fetchUrl );
426
443
NodeList accNodes , titleNodes , sampleNodes , dateNodes , orgnNodes , platformNodes , summaryNodes , typeNodes , pubmedNodes ;
427
444
try {
445
+ Document summaryDocument = parseMiniMLDocument ( fetchUrl );
428
446
accNodes = ( NodeList ) xaccession .evaluate ( summaryDocument , XPathConstants .NODESET );
429
447
titleNodes = ( NodeList ) xtitle .evaluate ( summaryDocument , XPathConstants .NODESET );
430
448
sampleNodes = ( NodeList ) xnumSamples .evaluate ( summaryDocument , XPathConstants .NODESET );
@@ -435,6 +453,8 @@ public List<GeoRecord> getGeoRecordsBySearchTerm( String searchTerms, int start,
435
453
typeNodes = ( NodeList ) xtype .evaluate ( summaryDocument , XPathConstants .NODESET );
436
454
pubmedNodes = ( NodeList ) xpubmed .evaluate ( summaryDocument , XPathConstants .NODESET );
437
455
// NodeList sampleLists = ( NodeList ) xsamples.evaluate( summaryDocument, XPathConstants.NODESET );
456
+ } catch ( EmptyXmlDocumentException e ) {
457
+ throw new RuntimeException ( "Got an empty MINiML document for " + fetchUrl , e );
438
458
} catch ( XPathExpressionException e ) {
439
459
throw new RuntimeException ( String .format ( "Failed to parse XML for %s" , searchUrl ), e );
440
460
}
@@ -610,7 +630,7 @@ public List<GeoRecord> getRecentGeoRecords( int startPage, int pageSize ) throws
610
630
* exposed for testing
611
631
*
612
632
*/
613
- void parseMINiML ( GeoRecord record , Document detailsDocument ) throws IOException {
633
+ void parseMINiML ( GeoRecord record , Document detailsDocument ) {
614
634
// e.g. https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE180363&targ=gse&form=xml&view=full
615
635
NodeList relTypeNodes ;
616
636
String overallDesign ;
@@ -706,15 +726,15 @@ private void getDetails( GeoRecord record ) {
706
726
*/
707
727
try {
708
728
parseMINiML ( record , parseMiniMLDocument ( miniMLURL ) );
709
- } catch ( IOException e ) {
729
+ } catch ( EmptyXmlDocumentException | IOException e ) {
710
730
log .error ( e .getMessage () + " while processing MINiML for " + record .getGeoAccession ()
711
731
+ ", subseries status will not be determined." );
712
732
}
713
733
}
714
734
715
735
try {
716
736
getSampleDetails ( record );
717
- } catch ( EmptyMinimlDocumentException | IOException e ) {
737
+ } catch ( EmptyXmlDocumentException | IOException e ) {
718
738
log .error ( e .getMessage () + " while processing MINiML for " + record .getGeoAccession ()
719
739
+ ", sample details will not be obtained" );
720
740
}
@@ -761,7 +781,7 @@ private void getMeshHeadings( GeoRecord record ) throws IOException {
761
781
* Fetch and parse MINiML for samples.
762
782
*
763
783
*/
764
- private void getSampleDetails ( GeoRecord record ) throws IOException {
784
+ private void getSampleDetails ( GeoRecord record ) throws EmptyXmlDocumentException , IOException {
765
785
// Fetch miniML for the samples.
766
786
// e.g. https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE171682&targ=gsm&form=xml&view=full
767
787
URL sampleMINIMLURL = new URL ( "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gsm&form=xml&view=full&acc=" + urlEncode ( record .getGeoAccession () ) );
@@ -806,18 +826,16 @@ private String urlEncode( String s ) {
806
826
* @throws IOException if there is a problem while manipulating the file or if the number of records in the document
807
827
* exceeds {@link #MAX_MINIML_RECORD_SIZE}
808
828
*/
809
- Document parseMiniMLDocument ( URL url ) throws IOException {
829
+ Document parseMiniMLDocument ( URL url ) throws EmptyXmlDocumentException , IOException {
810
830
return parseMiniMLDocument ( url , MAX_RETRIES , null );
811
831
}
812
832
813
- private Document parseMiniMLDocument ( URL url , int maxRetries , @ Nullable IOExceptionWithRetry errorFromPreviousAttempt ) throws IOException {
833
+ private Document parseMiniMLDocument ( URL url , int maxRetries , @ Nullable IOExceptionWithRetry errorFromPreviousAttempt ) throws EmptyXmlDocumentException , IOException {
814
834
try ( InputStream is = openUrlWithMaxSize ( url , MAX_MINIML_RECORD_SIZE ) ) {
815
835
return GeoBrowser .docFactory .newDocumentBuilder ().parse ( is );
816
836
} catch ( ParserConfigurationException | SAXException e ) {
817
- if ( isCausedByAnEmptyMinimlDocument ( e ) ) {
818
- throw new EmptyMinimlDocumentException ( e );
819
- } else if ( isLikelyCausedByAPrivateGeoRecord ( e ) ) {
820
- throw new LikelyNonPublicGeoRecordException ( e );
837
+ if ( isCausedByAnEmptyXmlDocument ( e ) ) {
838
+ throw new EmptyXmlDocumentException ( e );
821
839
} else {
822
840
throw new RuntimeException ( String .format ( "Failed to parse MINiML from URL %s" , url ), e );
823
841
}
@@ -840,20 +858,20 @@ private Document parseMiniMLDocument( URL url, int maxRetries, @Nullable IOExcep
840
858
}
841
859
}
842
860
861
+ /**
862
+ * Check if the given exception is eligible for being retried.
863
+ * <p>
864
+ * For now, just exclude inputs that are too large from being reattempted.
865
+ */
843
866
private boolean isEligibleForRetry ( IOException e ) {
844
- return !ExceptionUtils .hasCause ( e , MinimlDocumentTooLargeException .class );
845
- }
846
-
847
- private boolean isCausedByAnEmptyMinimlDocument ( Exception e ) {
848
- return e instanceof SAXParseException && e .getMessage ().contains ( "Premature end of file." );
867
+ return !ExceptionUtils .hasCause ( e , InputTooLargeException .class );
849
868
}
850
869
851
870
/**
852
- * GEO delivers an HTML document for non-public datasets
853
- * it's possible for this specific case because we're not querying a dataset in particular
871
+ * Check if an exception is caused by an empty MINiML document.
854
872
*/
855
- private boolean isLikelyCausedByAPrivateGeoRecord ( Exception e ) {
856
- return e instanceof SAXParseException && e .getMessage ().contains ( "White spaces are required between publicId and systemId " );
873
+ private boolean isCausedByAnEmptyXmlDocument ( Exception e ) {
874
+ return e instanceof SAXParseException && e .getMessage ().contains ( "Premature end of file. " );
857
875
}
858
876
859
877
/**
@@ -867,8 +885,23 @@ private InputStream openUrlWithMaxSize( URL url, long maxSize ) throws IOExcepti
867
885
return new LimitedInputStream ( inputStream , maxSize ) {
868
886
@ Override
869
887
protected void raiseError ( long pSizeMax , long pCount ) throws IOException {
870
- throw new MinimlDocumentTooLargeException ( String .format ( "Document exceeds %d B." , maxSize ) );
888
+ throw new InputTooLargeException ( String .format ( "Document exceeds %d B." , maxSize ) );
871
889
}
872
890
};
873
891
}
892
+
893
+ private static class InputTooLargeException extends IOException {
894
+ public InputTooLargeException ( String message ) {
895
+ super ( message );
896
+ }
897
+ }
898
+
899
+ /**
900
+ * Exception raised when an empty XML document is encountered.
901
+ */
902
+ private static class EmptyXmlDocumentException extends Exception {
903
+ public EmptyXmlDocumentException ( Throwable cause ) {
904
+ super ( "The XML document was empty" , cause );
905
+ }
906
+ }
874
907
}
0 commit comments