%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Please note that whilst this template provides a
% preview of the typeset manuscript for submission, it
% will not necessarily be the final publication layout.
%
% letterpaper/a4paper: US/UK paper size toggle
% num-refs/alpha-refs: numeric/author-year citation and bibliography toggle
%\documentclass[letterpaper]{oup-contemporary}
\documentclass[a4paper,num-refs]{oup-contemporary}
%%% Journal toggle; only specific options recognised.
%%% (Only "gigascience" and "general" are implemented now. Support for other journals is planned.)
\journal{gigascience}
\usepackage{color}
\usepackage{graphicx}
\usepackage{siunitx}
\usepackage{cite}
\usepackage{svg}
\usepackage{enumitem}
%\usepackage{courier}
%%% Flushend: You can add this package to automatically balance the final page, but if things go awry (e.g. section contents appearing out-of-order or entire blocks or paragraphs are coloured), remove it!
% \usepackage{flushend}
\usepackage{tikz}
\def\checkmark{\tikz\fill[scale=0.4](0,.35) -- (.25,0) -- (1,.7) -- (.25,.15) -- cycle;}
% Commands for \textit{CWLProv} Table
\usepackage{mathabx}
\newcommand{\return}{$\drsh$ ~}
\newcommand{\nary}[1]{
~ \return #1
}
\newcommand{\placeholder}[1]{
\textit{#1}
}
% RRIDs like RRID:SCR_008394
% detokenize as we need to escape _
\newcommand{\rrid}[1]{
(\href{http://identifiers.org/rrid/RRID:#1}{\detokenize{#1}})
}
%%% https://texblog.org/2014/10/24/removinghiding-a-column-in-a-latex-table/
\usepackage{array}
\newcolumntype{H}{>{\setbox0=\hbox\bgroup}c<{\egroup}@{}}
\title{Sharing interoperable workflow provenance: A review of best practices and their practical application in CWLProv}
\author[1,2,\authfn{1}]{Farah Zaib Khan} %% https://orcid.org/0000-0002-6337-3037
\author[2,3,\authfn{1}]{Stian Soiland-Reyes} %% https://orcid.org/0000-0001-9842-9718
\author[1,\authfn{1}]{Richard O. Sinnott} %% http://orcid.org/0000-0001-5998-222X
\author[1,\authfn{1}]{Andrew Lonie} %% https://orcid.org/0000-0002-2006-3856
\author[3,\authfn{1}]{Carole Goble} %% https://orcid.org/0000-0003-1219-2137
\author[2,\authfn{1}]{Michael R. Crusoe} %% https://orcid.org/0000-0002-2961-9670
\affil[1]{The University of Melbourne, Australia }
\affil[2]{Common Workflow Language Project}
\affil[3]{The University of Manchester, UK}
%%% Author Notes
\authnote{\authfn{1}khanf1@unimelb.edu.au; soiland-reyes@manchester.ac.uk; rsinnott@unimelb.edu.au;
alonie@unimelb.edu.au;
carole.goble@manchester.ac.uk;
mrc@commonwl.org}
%%% Paper category
\papercat{Research}
%%% "Short" author for running page header
\runningauthor{Khan et al.}
%%% Should only be set by an editor
\jvolume{00}
\jnumber{0}
\jyear{2018}
\begin{document}
\begin{frontmatter}
\maketitle
\begin{abstract}
\textbf{Background}: The automation of data analysis in the form of \textit{scientific workflows} has become a widely adopted practice in many fields of research. Computationally driven data-intensive experiments using workflows enable Automation, Scaling, Adaptation and Provenance support (ASAP). However, there are still several challenges associated with the effective sharing, publication and reproducibility of such workflows due to the incomplete capture of provenance and the lack of interoperability between different technical (software) platforms.
\newline
\textbf{Results}: Based on best practice recommendations identified from literature on workflow design, sharing and publishing, we define a hierarchical provenance framework to achieve uniformity in the provenance and support comprehensive and fully re-executable workflows equipped with domain-specific information. To realise this framework, we present \textit{CWLProv}, a standards-based format for representing any workflow-based computational analysis, producing workflow output artefacts that satisfy the various levels of provenance. We utilize open source community-driven standards: interoperable workflow definitions in Common Workflow Language (CWL), structured provenance representation using the W3C PROV model, and resource aggregation and sharing as workflow-centric Research Objects (RO) generated along with the final outputs of a given workflow enactment. We demonstrate the utility of this approach through a practical implementation of \textit{CWLProv} and evaluation using real-life genomic workflows developed by independent groups.
\newline
\textbf{Conclusions}: The underlying principles of the standards utilized by \textit{CWLProv} enable semantically-rich and executable Research Objects that capture computational workflows with retrospective provenance such that any platform supporting CWL will be able to understand the analysis, re-use the methods for partial re-runs, or reproduce the analysis to validate the published findings.
\end{abstract}
\begin{keywords}
Provenance; Common Workflow Language; CWL; Research Object; RO; BagIt; Interoperability; Scientific Workflows; Containers
\end{keywords}
\end{frontmatter}
\section{Introduction} \label{sec:introduction}
Out of the many big data domains, genomics is considered \textit{``the most demanding''} across all stages of the data lifecycle: acquisition, storage, distribution and analysis \citep{stephens_2015}. As genomic data is growing at an unprecedented rate due to improved sequencing technologies and reduced cost, it is currently challenging to analyse the data at a rate matching its production. With data growing exponentially in size and volume, the practice of performing computational analyses using \textit{workflows} has overtaken more traditional research methods using ad-hoc scripts, which were the typical modus operandi over the last few decades \citep{atkinson_2017, Spjuth2015}. Scientific workflow design and management has become an essential part of many computationally driven data-intensive analyses, enabling Automation, Scaling, Adaptation and Provenance support (ASAP) \citep{cuevasvicenttn_2012}. Increased use of workflows has driven rapid growth in the number of computational data analysis Workflow Management Systems (WMSs), with hundreds of heterogeneous approaches now existing for workflow specification and execution \citep{cwl-existing-workflow-systems}. There is an urgent need for a common format and standard to define workflows and enable sharing of analysis results using a given workflow environment.
\begin{keypoints*} \label{contributions}
The contribution of this paper is fourfold:
\begin{itemize}
\item We have gathered best-practice recommendations from the existing literature, and reflect on the various authors' experiences with workflow management systems, especially with regard to the factors to consider when a computational analysis is designed, executed and shared.
\item Combining the above with our own experiences from empirical studies \citep{kanwal_2017, wolstencroft_2013, moller_2017, Alterovitz2019}, we define a set of hierarchical levels of provenance tracking and method sharing, where the highest level represents complete understanding of the shared resources, supported by reproducibility and re-use of the methods from the lower levels.
\item Building on this provenance hierarchy, we define \textit{CWLProv} for the methodical representation of the artefacts of a given workflow enactment in any study involving computational data-intensive analysis.
\item Finally, we demonstrate the utilisation of \textit{CWLProv} by extending an existing workflow execution engine \textit{cwltool} \citep{cwltool} to produce workflow-centric Research Objects generated as a result of a given workflow enactment. We illustrate this through a case study of using workflows designed by external (independent) developers, and subsequently evaluate the interoperability, reproducibility and completeness of the generated \textit{CWLProv} outcome.
\end{itemize}
\end{keypoints*}
Common Workflow Language (CWL) \citep{cwl} has emerged as a workflow definition standard designed to enable portability, interoperability and reproducibility of analyses between workflow platforms. CWL has been widely adopted by more than 20 organisations, providing an interoperable bridge overcoming the heterogeneity of workflow environments.
Whilst a common standard for workflow definition is an important step towards interoperable solutions for workflow specifications, sharing and publishing the \emph{results} of these workflow enactments in a common format is equally important. Transparent and comprehensive sharing of experimental designs is critical to establish trust and ensure authenticity, quality and reproducibility of any workflow-based research result. Currently there is no common format defined and agreed upon for interoperable workflow archiving or sharing \citep{Ivie2018}.
In this paper, we utilize open-source standards such as CWL together with related efforts such as Research Objects (ROs) \citep{belhajjame_2015}, BagIt \citep{bagit17} and PROV \citep{Missier2013} to define \textbf{\textit{CWLProv}}, a format for the interoperable representation of a CWL workflow enactment. We focus on production of a workflow-centric executable RO as the final result of a given CWL workflow enactment. This RO is equipped with the artefacts used in a given execution including the workflow inputs, outputs and, most importantly, the retrospective provenance. This approach enables the complete sharing of a computational analysis such that any future CWL-based workflow can be re-run given the best practices discussed later for software environment provision are followed.
The concept of workflow-centric ROs has been previously considered \citep{belhajjame_2015, hettne_2014, belhajjame_2012} for structuring the analysis methods and aggregating digital resources utilized in a given analysis. The generated ROs in these studies typically aggregate data objects, example inputs, workflow specifications, attribution details, details about the execution environment amongst various other elements. These previous efforts were largely tied to a single platform or a single WMS. \textit{CWLProv} aims to provide a platform-independent solution for workflow sharing, enactment and publication. All the standards and vocabularies used to design \textit{CWLProv} have an overarching goal to support a domain-neutral and interoperable solution (detailed in Section \textbf{\nameref{sec:standards}}).
The contributions of this work are summarized in the \textbf{Key Points} section, and the remainder of this paper is structured as follows. In Section \textbf{\nameref{sec:background}} we discuss the key concepts and related work followed by a summary of the published best-practices and recommendations for workflow representation and sharing in Section \textbf{\nameref{sec:levels}}. This section also details the hierarchical provenance framework that we define to provide a principled approach for provenance capture and method sharing. Section \textbf{\nameref{sec:CWLProv}} introduces \textit{CWLProv} and outlines its format, structure and the details of the standards and ontologies it utilizes. Section \textbf{\nameref{sec:demo}} presents the implementation details of \textit{CWLProv} using \textit{cwltool} \citep{cwltool} and Section \textbf{\nameref{sec:evaluation}} demonstrates and evaluates the implemented module for three existing workflow case studies. We discuss the challenges of interoperable workflow sharing and the limitations of the proposed solution, listing several possible future research directions, in Section \textbf{\nameref{sec:discussion}} before finally drawing conclusions on the work as a whole in Section \textbf{\nameref{sec:conclusion}}.
\section{Background and Related Work} \label{sec:background}
This work draws upon a range of topics, such as \textit{Provenance} and \textit{Interoperability}. We define these here to provide better context for the reader.
\subsection{Provenance} \label{sec:provenance}
A number of studies have advocated the need for complete provenance tracking of scientific workflows to ensure transparency, reproducibility, analytic validity, quality assurance and attribution of (published) research results \citep{herschel_2017}. The term \textit{Provenance} is defined by the World Wide Web Consortium (W3C) \citep{PROVDM} as:
\begin{quote}
\centering
\textit{``Provenance is information about entities, activities, and people involved in producing a piece of data or thing, which can be used to form assessments about its quality, reliability or trustworthiness.''}
\end{quote}
Provenance for workflows is commonly divided into the following three categories: \textit{Retrospective Provenance}, \textit{Prospective Provenance} and \textit{Workflow Evolution}. \textit{Retrospective Provenance} refers to the detailed record of the implementation of a computational task, including the details of every executed process together with comprehensive information about the execution environment used to derive a specific product. \textit{Prospective Provenance} refers to the ``recipes'' used to capture a set of computational tasks and their order, e.g. the workflow specification \citep{clifford_2008}. This is typically given as an abstract representation of the steps (tools/data analysis steps) that are necessary to create a particular research output, e.g. a data artefact. \textit{Workflow Evolution} refers to the tracking of any alteration in the existing workflow resulting in another version of the workflow that may produce either the same or different resultant data artefacts \citep{Casati1998}. In this work, our focus is mainly on improving representation and capture of \textit{Retrospective Provenance}.
\subsection{Interoperability} \label{sec:interoperability}
The concept of interoperability varies in different domains. Here we focus on \textit{computational interoperability} defined as:
\begin{quote}
\centering
\textit{The ability of two or more components or systems to exchange information and to use the information that has been exchanged} \citep{interope55:online}.
\end{quote}
The focus of this study is to propose and devise methods to achieve \textit{syntactic}, \textit{semantic} and \textit{pragmatic} interoperability as defined in the Levels of Conceptual Interoperability Model (LCIM) \citep{Tolk}. \textit{Syntactic} interoperability is achieved when a common data format for information exchange is unambiguously defined. The next level of interoperability, referred to as \textit{semantic} interoperability, is reached when the content of the actual information exchanged is unambiguously defined. Once there is an agreement about the format and content of the information, \textit{pragmatic} interoperability is achieved when the context, application and use of the shared information and data exchanged is also unambiguously defined. In Section \textbf{\nameref{sec:eval-results}}, we relate these general definitions to specific workflow applications with respect to workflow-centric ROs and describe to what extent these interoperability requirements are addressed.
\subsection{Related Work} \label{sec:relwork}
We focus on studies and efforts that try to resolve the issue of the availability of the resources used in a given computational analysis. In addition, we cover efforts directed towards provenance capture of workflow enactments. As these concepts have been around for a considerable time, we restrict our attention to scientific workflows and studies related to the bioinformatics domain.
\subsubsection{\textcolor{black}{Workflow Software Environment Capture}}
\textit{Freezing} and packaging the run-time environment to encompass all the software components and their dependencies used in an analysis is a recommended and widely adopted practice \citep{cohen2017scientific}, especially with the use of cloud computing resources, where images and snapshots of the cloud instances are created and shared with fellow researchers \citep{howe}. Nowadays, preservation and sharing of the software environment, e.g. in open access repositories, is becoming a regular practice in the workflow domain as well. Leading platforms managing infrastructure and providing cloud computing services and configuration on demand include DigitalOcean \citep{DigitalOcean}, Amazon Elastic Compute Cloud \citep{AmazonEC}, Google Cloud Platform \citep{GoogleCl} and Microsoft Azure \citep{Microsof}. The instances launched on these platforms can be saved as snapshots and published with an analysis study to later re-create an instance representing the computing state at analysis time.
Using \textit{``System-wide packaging''} for data-driven analyses, although simplest on the part of workflow developers and researchers, has its own caveats. One notable issue is the size of the snapshot: as it captures everything in an instance at a given time, the size can range from a few gigabytes to many terabytes. To distribute research software and share execution environments, various light-weight, container-based virtualisation technologies and package managers are emerging, including Docker, Singularity, Debian Med and Bioconda.
\textit{Docker} \citep{docker} is a lightweight container-based virtualisation technology that facilitates the automation of application development by archiving software systems and environments to improve the portability of applications on many common platforms including Linux, Microsoft Windows, Mac OS X and cloud instances. \textit{Singularity} \citep{kurtzer_2017} is also a cross-platform open source container engine, specifically supporting High Performance Computing (HPC) resources. An existing Docker format software image can be imported and used by the Singularity container engine. \textit{Debian Med} \citep{debian-med} contributes packages for medical practice and biomedical research to the Debian Linux distribution, lately also including workflows \citep{moller_2017}. \textit{Bioconda} \citep{Grning2018} packages, based on the open source package manager Conda \citep{Conda}, are available for Mac OS X and Linux environments, improving the availability and portability of software used in the life science domain.
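Whichever of these technologies is used, the captured environment only supports provenance if the exact image or package identity is recorded alongside the analysis. The following is a minimal, illustrative sketch (in Python, not part of any of the tools above) of how a resolved Docker image digest might be recorded for later inclusion in a provenance trace; the image name shown is an assumption chosen purely for illustration.
\begin{verbatim}
import json
import subprocess

def docker_image_digest(image: str) -> str:
    """Return the repository digest of a locally available Docker image.

    Assumes the docker CLI is installed and the image has been pulled,
    so that the RepoDigests field is populated.
    """
    out = subprocess.run(
        ["docker", "image", "inspect",
         "--format", "{{json .RepoDigests}}", image],
        check=True, capture_output=True, text=True,
    ).stdout
    digests = json.loads(out)
    return digests[0] if digests else ""

if __name__ == "__main__":
    image = "quay.io/biocontainers/bwa:0.7.17--h5bf99c6_8"  # illustrative
    print(json.dumps({"image": image,
                      "digest": docker_image_digest(image)}, indent=2))
\end{verbatim}
Pinning the digest, rather than a mutable tag, is what makes a later re-enactment verifiably use the same software environment.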
\subsubsection{\textcolor{black}{Data/Method Preservation, Aggregation \& Sharing}}
Preserving and sharing only the software environment is not enough to verify results of any computational analysis or re-use the methods used (e.g. workflows) with a different dataset. It is also necessary to share other details including data (example or the original), scripts, workflow files, input configuration settings, the hypothesis of the experiment and any/all trace/logging information related to ``what happened'', i.e. the retrospective provenance of the actual workflow enactment. The publishing of resources to improve the state of scholarly publication is now supported by various online repositories, including Zenodo \citep{Zenodo}, GitHub \citep{GitHub}, myExperiment \citep{Goble2010} and Figshare \citep{figshare}. These repositories facilitate collaborative research, in addition to public sharing of source code and the results of a given analysis. There is however no agreed format that must be followed when someone shares artefacts associated with an analysis. As a result, the quality of the shared resources can range from a highly annotated, properly documented and complete set of artefacts, to raw data with undocumented code and incomplete information about the analysis as a whole. Individual organisations or groups might provide a set of ``recommended practices'', e.g. in readme files, to attempt to maintain the quality of shared resources. The initiative \textit{Code as a Research Object} \citep{CodeasaR} is a joint project between Figshare, GitHub and Mozilla Science Lab \citep{Mozilla} and aims to archive any GitHub code repository to Figshare and produce a Digital Object Identifier (DOI) to improve the discovery of resources\footnote{For the source code that supports this work we have used a similar publishing feature with Zenodo.}.
Reprozip \citep{Chirigati2016} aims to resolve portability issues by identifying and packaging all dependencies in a self-contained package which, when unpacked and executed on another system (with Reprozip installed), should reproduce the methods and results of the analysis. Each package also contains a human-readable configuration file containing provenance information obtained by tracing system calls during execution. The corresponding provenance trace is, however, not formatted using the existing open standards established by the community.
Several platform-dependent studies have extended existing standards by implementing the Research Object model and improving the aggregation of resources. \citet{belhajjame_2015} proposed the application of ROs to develop workflow-centric ROs containing data and metadata to support the understandability of the utilized methods (in this case workflow specifications). They explored five essential requirements for workflow preservation and identified data and metadata that could be stored to satisfy the said requirements. These requirements include providing example data, preserving workflows with provenance traces, annotating workflows, tracking the evolution in workflows and packaging the auxiliary data and information with workflows. They proposed extensions to existing ontologies such as Object Reuse and Exchange (ORE), the Annotation Ontology (AO) and PROV-O, with four additional ontologies to represent workflow-specific information. However, as stated in the paper, the scope of the proposed model at that time was not focused on interoperability of heterogeneous workflows, as it was demonstrated for a workflow specific to the Taverna WMS using myExperiment, which makes it quite platform-dependent.
A domain-specific solution is proposed by \citet{gomezperez_2017} by extending the RO model to equip workflow-centric ROs with information catering for the specific needs of the Earth Science community, resulting in enhanced discovery and reusability by experts. They demonstrated that the principles of ROs can support extensions to generate aggregated resources leveraging domain-specific knowledge. \citet{hettne_2014} used three genomic workflow case studies to demonstrate the utilisation of ROs to capture methods and data, supporting querying and useful extraction of information about the scientific investigation under observation. The solution was tightly coupled with the Taverna WMS and hence, if shared, would not be reproducible outside the Taverna environment. Other notable efforts to use ROs for workflow preservation and method aggregation include \citep{wolstencroft_2013} in systems biology, \citep{Custovic799} in clinical settings and \citep{Alterovitz2019} in precision medicine.
\subsubsection{\textcolor{black}{Provenance Capture \& Standardization}}
A range of standards for provenance representation have been proposed. Many studies have emphasized the need for provenance focusing on aspects such as scalability, granularity, security, authenticity, modelling and annotation \citep{herschel_2017}. They identify the need to support standardized dialogues to make provenance interoperable. Many of these were used as inputs to initial attempts at creating a standard Provenance Model to tackle the often inconsistent and disjointed terminology related to provenance concepts. This ultimately resulted in the specification of the \textit{Open Provenance Model} (OPM) \citep{Moreau2008} together with an open-source model for the governance of OPM \citep{moreau2009governance}. Working towards similar goals of interoperability and standardization of provenance for web technologies, the World Wide Web Consortium (W3C) Provenance Incubator Group \citep{W3CProvWorkingGroup} and the authors of OPM together set the fourth provenance challenge at the International Provenance and Annotation Workshop, 2010 (IPAW'10), which later resulted in \textit{PROV}, a family of documents serving as the conceptual model for provenance capture, its representation, sharing and exchange over the Web \citep{Moreau2015}, regardless of the domain or platform. Since then, a number of studies have proposed extensions to this domain-neutral standard. The model is general enough to be adapted to any field and flexible enough to allow extensions for specialized cases.
\citet{michaelides_2016} presented a domain-specific PROV-based solution for retrospective provenance to support portability and reproducibility of a statistical software suite. They captured the essential elements from the log of a workflow enactment and represented them using an intermediate notation. This representation was later translated to PROV-N and used as the basis for the PROV Template System. A Linux-specific system provenance approach was proposed in \citep{pasquier_2017}, demonstrating retrospective provenance capture at the system level. Another project, \textit{UniProv}, works to extract information from Unicore middleware and transform it into a PROV-O representation to facilitate the back-tracking of workflow enactments \citep{giesler_2017}. Other notable domain-specific efforts leveraging the established standards to record provenance and context information are \textit{PROV-man} \citep{benabdelkader_2015}, PoeM \citep{PoeM} and micropublications \citep{Clark2014}. Platforms such as VisTrails and Taverna have built-in retrospective provenance support. \textit{Taverna} \citep{wolstencroft_2013} implements an extensive provenance capture system, \textit{TavernaProv} \citep{tavernaprov}, utilising both PROV ontologies as well as ROs aggregating the resources used in an analysis. \textit{VisTrails} \citep{freire_2012} is an open source project supporting platform-dependent provenance capture, visualisation and querying for extraction of required information about a workflow enactment. \citet{Chirigati2016} provide an overview of PROV terms and how they can be translated from the VisTrails schema and serialized to PROV-XML. \textit{WINGS} \citep{wings2011} can report fine-grained workflow execution provenance as Linked Data using the OPMW ontology \citep{garijo_2017}, which builds on both PROV-O and OPM.
All these efforts are fairly recent and use a standardized approach to provenance capture and hence are relevant to our work on the capture of retrospective provenance. However, our aim is a domain-neutral and platform-independent solution that can be easily adapted for any domain and shared across different platforms and operating systems.
As evident from the literature, there are efforts in progress to resolve the issues associated with effective and complete sharing of computational analyses, including both the results and the provenance information. These studies range from highly domain-specific solutions and platform-dependent objects to open source, flexible, interoperable standards. CWL has widespread adoption as a workflow definition standard and hence is an ideal candidate for portable workflow definitions. The next section investigates existing studies focused on workflow-centric science, and summarises the best practice recommendations put forward in these studies. From this we define a hierarchical provenance and resource sharing framework.
\begin{figure*} [t!]
\centering
\includegraphics[width=.9\textwidth]{images/recommendations2.png}
\captionsetup{justification=centering}
\caption{Recommendations from Table \ref{tab:recommendation:wide} classified into broad categories} \label{fig:recommendationclasses}
\end{figure*}
\begin{table*}[!htbp]
\caption{Summarized recommendations and justifications from literature covering best practices on reproducibility, accessibility, interoperability and portability of workflows}\label{tab:recommendation:wide}
\begin{tabularx}{\linewidth}{p{1.4cm} L L}
\toprule
{\textbf{R.no}} & {\textbf{Recommendations}} & {\textbf{Justifications}}\\
\midrule
R1 \newline\smaller{parameters} & Save and share all parameters used for each software executed in a given workflow (including default values of parameters used) \citep{Nekrutenko2012, garijo_2013, garijo_2017, sandve_2013}. & Impacts on reproducibility of results since different inputs and configurations of the software can produce different results. Different versions of a tool might upgrade the default values of the parameters. \\ \midrule
R2 \newline\smaller{automate} & Avoid manual processing of data and if using \textit{shims} \citep{Mohan2014} then make these part of the workflow to fully automate the computational process \citep{Nekrutenko2012, sandve_2013}. & This ensures the complete capture of the computational process without broken links so that the analysis can be executed without need for performing manual steps. \\ \midrule
R3 \newline\smaller{intermediate} & Include intermediate results where possible when publishing an analysis \citep{garijo_2013, garijo_2017, sandve_2013}. & Intermediate data products can be used to inspect and understand shared analysis when re-enactment is not possible. \\ \midrule
R4 \newline\smaller{sw-version} & Record the exact software versions used \citep{Nekrutenko2012, sandve_2013}. & This is necessary for reproducibility of results as different software versions can produce different results. \\ \midrule
R5 \newline\smaller{data-version} & If using public data (reference data, variant databases), then it is necessary to store and share the actual data versions used \citep{Spjuth2015, kanwal_2017, Nekrutenko2012, sandve_2013}. & This is needed as different versions of data, e.g. human reference genome or variant databases, can result in slightly different results for the same workflow. \\ \midrule
R6 \newline\smaller{annotation} & Workflows should be well-described, annotated and offer associated metadata. Annotations such as user-contributed tags and versions should be assigned to workflows and shared when publishing the workflows and associated results \citep{belhajjame_2015, belhajjame_2012, garijo_2017, Littauer2012, stodden_2016}. & Metadata and annotations improve the understandability of the workflow, facilitate independent re-use by someone skilled in the field, make workflows more accessible and hence promote the longevity of the workflows. \\ \midrule
R7 \newline\smaller{identifier} & Use and store stable identifiers for all artefacts including the workflow, the datasets and the software components \citep{Littauer2012, stodden_2016}. & Identifiers play an important role in the discovery, citation and accessibility of resources made available in open-access repositories. \\ \midrule
R8 \newline\smaller{environment} & Share the details of the computational environment \citep{belhajjame_2015, kanwal_2017, stodden_2016}. & Such details support requirements analysis before any re-enactment or reproducibility is attempted. \\ \midrule
R9 \newline\smaller{workflow} & Share workflow specifications/descriptions used in the analysis \citep{belhajjame_2015, garijo_2013, garijo_2017, stodden_2016, Stodden2014}. & The same workflow specifications can be used with different datasets thereby supporting re-usability. \\ \midrule
R10 \newline\smaller{software} & Aggregate the software with the analysis and share this when publishing a given analysis \citep{belhajjame_2015, kanwal_2017, stodden_2016, Stodden2014, garijo_2017}. & Making software available reduces dependence on third party resources and as a result minimizes \textit{workflow decay} \citep{Zhao2012}. \\ \midrule
R11 \newline\smaller{raw-data} & Share raw data used in the analysis \citep{belhajjame_2015, garijo_2013, garijo_2017, stodden_2016, Stodden2014}. & When someone wants to validate published results, availability of data supports verification of claims and hence establishes trust in the published analysis. \\ \midrule
R12 \newline\smaller{attribution} & Store all attributions related to data resources and software systems used \citep{garijo_2017, Stodden2014}. & Accreditation supports proper citation of resources used. \\ \midrule
R13 \newline\smaller{provenance} & Workflows should be preserved along with the provenance trace of the data and results \citep{belhajjame_2015, belhajjame_2012, garijo_2017, sandve_2013, Stodden2014}. & A provenance trace provides a historical view of the workflow enactment, enabling end users to better understand the analysis retrospectively. \\ \midrule
R14 \newline\smaller{diagram} & Data flow diagrams of the computational analysis using workflows should be provided \citep{kanwal_2017, garijo_2013}. & These diagrams are easy to understand and provide a human readable view of the workflow. \\ \midrule
R15 \newline\smaller{open-source} & Open source licensing for methods, software, code, workflows and data should be adopted instead of proprietary resources \citep{kanwal_2017, garijo_2013, sandve_2013, stodden_2016, Stodden2014, Gymrek2016}. & This improves availability and legal re-use of the resources used in the original analysis, while restricted licenses would hinder reproducibility. \\ \midrule
R16 \newline\smaller{format} & Data, code and all workflow steps should be shared in a format that others can easily understand preferably in a system neutral language \citep{belhajjame_2015, garijo_2013, Gymrek2016}. & System neutral languages help achieve interoperability and make an analysis understandable. \\ \midrule
R17 \newline\smaller{executable} & Promote easy execution of workflows without making significant changes to the underlying environment \citep{Spjuth2015}. & In addition to helping reproducibility, this enables adapting the analysis methods to other infrastructures and improves workflow portability. \\ \midrule
R18 \newline\smaller{resource-use} & Information about compute and storage resources should be stored and shared as part of the workflow \citep{kanwal_2017}. & Such information can assist users in estimating the required resources needed for an analysis and thereby reduce the amount of failed executions. \\ \midrule
R19 \newline\smaller{example} & Example input and sample output data should be preserved and published along with the workflow-based analysis \citep{belhajjame_2015, Zhao2012}. & This information enables more efficient test runs of an analysis to verify and understand the methods used. \\ \midrule
\bottomrule
\end{tabularx}
\begin{tablenotes}
\item This list is not exhaustive; other studies have identified separate issues (e.g. lab work provenance and data security) that are beyond the scope of this work.
\end{tablenotes}
\end{table*}
\section{Levels of Provenance and Resource Sharing} \label{sec:levels}
Various studies have empirically investigated the role of automated computational methods in the form of workflows and published best practice recommendations to support workflow design, preservation, understandability and re-use. We summarise a number of these recommendations and their justifications in Table \ref{tab:recommendation:wide}, where each recommendation addresses a specific requirement of workflow design and sharing. These recommendations can be clustered into broad themes as shown in Figure \ref{fig:recommendationclasses}. This classification can be done in more than one way, e.g. according to how these recommendations support each FAIR dimension \citep{wilkinson_2016}. In this study, we have focused on categories with respect to workflow design, prospective provenance, data sharing, retrospective provenance, the computational environment required/used for an analysis and, lastly, better findability and understandability of all shared resources.
Sharing \textit{``all artefacts''} from a computational experiment (following all recommendations and best practices) is a demanding task without any informed guidance. It requires a consolidated understanding of the impact of the many different artefacts involved in that analysis. This places extra effort on workflow designers, (re)-users, authors and reviewers, and expectations on the community as a whole. Given the numerous WMSs and the differences in how each system deals with provenance documentation, representation and sharing of these artefacts, the granularity of the provenance information preserved will vary for each workflow definition approach. Hence, devising one universal but technology-specific solution for provenance capture and the related resource sharing is impossible. Instead we propose a generic framework of provenance in Figure \ref{fig:levels} that all WMSs can benefit from and conform to with minimal technical overhead.
The recommendations in Table \ref{tab:recommendation:wide} inform the definition of this framework, which classifies the granularity of provenance and related artefacts such that the uppermost level represents comprehensive, reproducible, understandable and provenance-rich sharing of a computational experiment. The purpose of this framework is threefold. First, because of its generic nature it brings uniformity to the provenance granularity across WMSs based on different workflow definition approaches. Second, it provides comprehensive and well-defined guidelines that can be used by researchers to conduct a principled analysis of the provenance of any published study. Third, due to its hierarchical nature, the framework can be leveraged by workflow authors to progress incrementally towards the most transparent workflow-centric analysis. Overall, this framework will help achieve a uniform level of provenance and resource sharing such that a given workflow-centric analysis is guaranteed to support the provenance applications of the corresponding level.
Our proposed provenance levels are ordered from lower to higher degrees of specificity. In brief, \textbf{Level 0} is unstructured information about the overall workflow enactment, \textbf{Level 1} adds structured retrospective provenance, access to primary data and executable workflows, \textbf{Level 2} enhances the white-box provenance for individual steps, and \textbf{Level 3} adds domain-specific annotations for improved understanding. These levels are described in the following sub-sections and mapped to the requirements in Table \ref{tab:recommendation:wide} that they aim to satisfy.
\begin{figure*} %[b!]
\centering
\includegraphics[width=.9\textwidth]{images/ProvenanceLevels}
\captionsetup{justification=centering}
\caption{Levels of Provenance and resource sharing and their applications}\label{fig:levels}
\end{figure*}
\subsection{Level 0} \label{sec:level0}
To achieve this level, researchers should share the workflow specifications, input parameters used for a given workflow enactment, raw logs and output data, preferably through an open-access repository. This is the minimum information that can be shared without any extra effort to support seamless reuse or understandability of a given analysis. Sharing artefacts at this level only requires uploading the associated resources to a repository, without necessarily providing any supporting metadata or provenance information. Information captured at \textit{Level 0} is the bare minimum that can be used for result interpretation.
Workflow definitions based on \textit{Level 0} can also potentially be re-purposed for other analyses. As argued by Ludäscher, a well-written scientific workflow and its graphical representation is itself a source of prospective provenance, giving the user an idea of the steps taken and the data produced \citep{ludascher2016brief}. Therefore a well-described workflow specification indirectly provides prospective provenance without aiming for it. In addition to the textual workflow specification, its graphical representation should also be shared, if available, for better understandability, fulfilling \textit{R14-diagram}. At this level, reproducing the workflow would only be possible if the end-user devotes extra effort to understand the shared artefacts and carefully recreate the execution environment. As open access journals frequently require availability of methods and data, many published studies now share workflow specifications and optionally the outputs, thereby achieving \textit{Level 0} and specifically satisfying \textit{R1-parameters} and \textit{R9-workflow} (Table \ref{tab:recommendation:wide}). In addition, the shared resources should carry an open licence starting from \textit{Level 0}, and this practice, proposed by \textit{R15-open-source}, should be adopted at each higher level.
\subsection{Level 1} \label{sec:level1}
At \textit{Level 1}, \textit{R4-sw-version}, \textit{R5-data-version}, \textit{R12-attribution} and \textit{R13-provenance} should be satisfied by providing retrospective provenance of the workflow enactment, i.e. a structured representation of machine-readable provenance which can answer questions such as ``what happened'', ``when it happened'', ``what was executed'', ``what was used'', ``who did this'' and ``what was produced''. Seamless re-enactment of the workflow should be supported at this level. This is only possible when, along with the provenance information, \textit{R8-environment} and \textit{R10-software} are satisfied, either by packaging the software environment for analysis sharing or by providing enough information about the software environment to guide the user to reliably re-enact the workflow. Hence \textit{R17-executable} should be satisfied, making it possible for end users to re-enact the shared analyses without making major changes to the underlying software environment.
In addition to the software availability and retrospective provenance, access to input data should also be provided fulfilling \textit{R11-raw-data}. This data can be used to re-enact the published methods or utilized in a different analysis, e.g. for performance comparison of methods. At \textit{Level 1}, it is preferable to provide content-addressable data artefacts such as input, output and intermediate files, avoiding local paths and file names to make a given workflow executable outside its local environment. The intermediate data artefacts should also be provided to facilitate inspection of all step results, hence satisfying \textit{R3-intermediate}. All resources, including workflow specifications and provenance, should be shared in a format that is understandable across platforms, preferably in a technology-neutral language as proposed by \textit{R16-format}.
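The content-addressing suggested above can be realised with an ordinary cryptographic checksum: each artefact is stored under, and referred to by, the hash of its bytes rather than its original path or file name. The following is a minimal sketch in Python; the \texttt{data/} folder layout and file names are assumptions made purely for illustration and do not prescribe the \textit{CWLProv} structure described later.
\begin{verbatim}
import hashlib
import shutil
from pathlib import Path

def add_content_addressed(src: Path, store: Path = Path("data")) -> Path:
    """Copy a file into a content-addressed store and return its new path.

    The file is named by the SHA-256 of its contents, so identical bytes
    always map to the same location regardless of the original file name.
    """
    digest = hashlib.sha256(src.read_bytes()).hexdigest()
    # A two-character prefix directory keeps any single folder from growing huge.
    dest = store / digest[:2] / digest
    dest.parent.mkdir(parents=True, exist_ok=True)
    if not dest.exists():
        shutil.copy2(src, dest)
    return dest

# Example: reference an input by hash instead of a local path.
# print(add_content_addressed(Path("reads_1.fastq")))
\end{verbatim}
A provenance trace can then cite the hash-derived path, which stays valid wherever the aggregated folder is copied.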
While software and data can be digitally captured, the hardware and infrastructure requirements also need to be captured to fulfill \textit{R18-resource-use}. This kind of information can naturally vary widely with runtime environments, architectures and data sizes \citep{Bubak_2013}, as well as rapidly becoming outdated as hardware and cloud offerings evolve. Nevertheless, a snapshot of the workflow's overall execution resource usage for an actual run can be beneficial to give a broad overview of the requirements, and can facilitate cost-efficient re-computation by taking advantage of spot-pricing for cloud resources \citep{Angiuoli_2011}.
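Such a coarse snapshot does not require WMS support; on POSIX systems it can be assembled from standard operating system counters. A minimal sketch, assuming the enactment can be wrapped in a single command invocation (the command and units are illustrative; note that \texttt{ru\_maxrss} is reported in kilobytes on Linux but in bytes on macOS):
\begin{verbatim}
import resource
import subprocess
import time

def run_and_snapshot(cmd):
    """Run a command and return a coarse resource-usage snapshot."""
    start = time.time()
    subprocess.run(cmd, check=True)
    usage = resource.getrusage(resource.RUSAGE_CHILDREN)
    return {
        "wall_clock_s": round(time.time() - start, 2),
        "cpu_user_s": usage.ru_utime,
        "cpu_system_s": usage.ru_stime,
        "max_rss_kb": usage.ru_maxrss,  # kilobytes on Linux
    }

# Example with a placeholder command:
# print(run_and_snapshot(["gzip", "-k", "reads_1.fastq"]))
\end{verbatim}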
\subsection{Level 2} \label{sec:level2}
It is a common practice in scientific workflows to modularize the workflow specifications by separating related tasks into ``sub-workflows'' or ``nested workflows'' \citep{cohen2017scientific}, to be incorporated and used in other workflows or be assigned to compute and storage resources in the case of distributed computing \citep{chen2011partitioning}. These modular solutions promote understanding and re-usability of the workflows, as researchers are inclined to use these modules instead of the workflow as a whole for their own computational experiments. An example of a sub-workflow is the mandatory ``pre-processing'' \citep{GATKBP} needed for the Genome Analysis ToolKit (GATK) best practice pipelines used for genomic variant calling. These steps can be separated into a sub-workflow to be used before any variant calling pipeline, be it somatic or germline.
At \textit{Level 1}, retrospective provenance is coarse-grained and as such, there is no distinction between workflows and their sub-workflows. Ludäscher \citep{ludascher2016brief} characterises workflow provenance as \textit{black-box} and database provenance as \textit{white-box}. The reasoning behind this distinction is that the steps in a workflow, especially on graphical user interface-based platforms, often abstract away and thereby obscure the actual tasks being implemented. In our previous work we used an empirical case study to demonstrate that declarative approaches to workflow definition resulted in transparent workflows with the fewest assumptions \citep{kanwal_2017}. This resolves the black box/white box issue to some extent, but to further support research transparency, we propose to share retrospective provenance logs for each nested/sub-workflow, making the details of a workflow enactment as explicit as possible and moving a step closer to \textit{white-box} provenance. These provenance logs will support the inspection and automatic re-enactment of targeted components of a workflow, such as a single step or a sub-workflow, without necessarily having to re-enact the full analysis. Some existing make-like systems such as Snakemake support partial re-enactments but typically rely on fixed file paths for input data and require manual intervention to provide the specific directory structure. With detailed provenance logs and the corresponding content-addressable data artefacts, partial re-runs can be achieved with automatic generation of the input configuration settings.
In addition, we propose to include \textit{permalinks} at \textit{Level 2} to identify the workflows and their individual steps; these facilitate the inspection of each step and aim to improve the longevity of the shared resources, hence supporting \textit{R7-identifier}. Improving \textit{R18-resource-use} for \textit{Level 2} would include resource usage per task execution. Along with execution times, this can be useful information to identify bottlenecks in a workflow and for more complex calculations in cost optimization models \citep{Malawski_2013}. At this provenance level, resource usage data will however also become more noisy and highly dependent on scheduling decisions by the workflow engine, e.g. sensitivity to cloud instance reuse or co-use for multiple tasks, or variation in data transfers between tasks on different instances. Thus \textit{Level 2} resource usage information should be further processed with statistical models for it to be meaningful for a user keen to estimate the resource requirements for re-enactment of a given analysis.
\subsection{Level 3} \label{sec:level3}
Levels 0--2 are generic and domain-neutral, and can apply to any scientific workflow. However, domain-specific information/metadata about data and processes plays an important role in better understanding of the analysis and exploitation of provenance information, e.g. for meaningful queries to extract information relevant to the domain under consideration \citep{Alper2018, Gaignard2014}. Adding domain-specific metadata, e.g. file formats, user-defined tags and other annotations, to generic retrospective provenance can improve the \textit{white-boxness} by providing domain context to the analysis, as described in \textit{R6-annotation}. Annotations can range from adding textual descriptions and tags to marking data with more systematic and well-defined domain-specific ontologies such as EDAM \citep{Ison2013} and BioSchemas \citep{Michel_2018} in the case of bioinformatic workflows. Some studies also propose to provide example or test data sets, which helps in analysing the shared methods and verifying their results (as described in \textit{R19-example}).
At \textit{Level 3}, the information from previous levels combined with specific metadata about data artefacts facilitates higher-level classification of workflow steps into \textit{motifs} \citep{garijo_2014} such as data retrieval, pre-processing, analysis and visualisation. This level of provenance, resource aggregation and sharing can provide a researcher-centric view of data and enable users to re-enact a set of steps or the full workflow by providing a filtered and annotated view of the execution. This can be non-trivial to achieve with mainstream methods of workflow definition and sharing, as it requires guided user annotations with controlled vocabularies, but it can be simplified by reusing related tooling from existing efforts like BioCompute Objects \citep{Alterovitz2019} and DataCrate \citep{datacrate_2018}.
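As a concrete illustration of the kind of annotation meant here, the snippet below attaches an EDAM format identifier and free-text tags to a content-addressed input file as a small JSON-LD document. The property names and file paths are invented for this sketch and are not the \textit{CWLProv} manifest schema; the EDAM identifier shown is believed to denote the FASTQ format.
\begin{verbatim}
import json

annotation = {
    "@context": {"edam": "http://edamontology.org/"},
    # Content-addressed path of the annotated artefact (illustrative).
    "about": "data/3a/3a7bd3e2360a3d29eea436fcfb7e44c735d117c4",
    "format": {"@id": "edam:format_1930"},   # FASTQ (believed EDAM id)
    "tags": ["sequencing reads", "GATK pre-processing input"],
}

print(json.dumps(annotation, indent=2))
\end{verbatim}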
Communicating resource requirements (\textit{R18-resource-use}) at \textit{Level 3} would involve domain-specific models for hardware use and cost prediction, as suggested for dynamic cloud costing \citep{biosimspacewebinar} in \textit{BioSimSpace} \citep{biosimspace}, or predicting assembler and memory settings through machine learning of variables like source biome, sequencing platform, file size, read count and base count in the \textit{European Bioinformatics Institute (EBI) Metagenomics} pipeline \citep{mitchell_2017}. For robustness, such models typically need to be derived from resource usage across multiple workflow runs with varied inputs, e.g. by a multi-user workflow platform. Taking advantage of \textit{Level 3} resource usage models might require pre-processing workflow inputs and calculations in an environment like R or Python, and so we recommend that models are provided as separate sidecar workflows for interoperable execution before the main workflow.
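The modelling itself need not be elaborate; the toy sketch below fits an ordinary least-squares model that predicts peak memory from input size and read count, using numbers invented for illustration in place of the logged \textit{Level 2} usage records a real model would be trained on.
\begin{verbatim}
import numpy as np

# Each row: input size (GB), reads (millions), observed peak memory (GB).
runs = np.array([
    [1.2,   10,  3.1],
    [4.8,   40,  8.9],
    [9.5,   80, 16.2],
    [20.1, 160, 31.0],
])
X = np.c_[np.ones(len(runs)), runs[:, :2]]        # intercept + features
coef, *_ = np.linalg.lstsq(X, runs[:, 2], rcond=None)

new_run = np.array([1.0, 12.0, 100.0])            # intercept, 12 GB, 100 M reads
print(f"predicted peak memory: {new_run @ coef:.1f} GB")
\end{verbatim}
Packaging such a model as a sidecar workflow keeps the prediction step itself portable and provenance-tracked.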
By explicit enumeration of the levels of provenance, it should be possible to quantify and directly assess the effort required to re-use a workflow and reproduce experiments. A similar effort, \textit{5-star Open Data} \citep{5star}, strongly advocates open-licensed structured representation, use of stable identifiers for data sharing and following Linked Data principles to cross-relate data. One challenge in achieving the Open Data stars is the need for tool support during data processing. In our framework we propose systematic workflow-centric resource sharing using structured Linked Data representation, including recording of the executed data operations. Hence, our effort complements the already proposed 5-star Open Data principles and contributes to further understanding by sharing the computational method following the same principles.
Requiring researchers to achieve the above-defined levels individually is unrealistic without guidance and direct technical support. Ideally, the conceptual meaning of these levels would be translated into a practical solution utilising the available resources. However, given the heterogeneity of workflow definition approaches, it is expected that the proposed framework, when translated into practical solutions, will also naturally result in varying workflow-centric solutions tied to specific WMSs. To support interoperability of workflow-centric analyses achieving the provenance levels, we propose \textbf{\textit{CWLProv}}, a format for annotating resource aggregations equipped with retrospective provenance. The next section describes \textit{CWLProv} and the associated standards that are applied in this process.
\section{CWLProv 0.6.0 and utilized standards}\label{sec:CWLProv}
Here we present \textit{CWLProv}, a format for the methodical representation of a workflow enactment and its associated artefacts, and for capturing and using retrospective provenance information. Keeping in view the recommendations from Table \ref{tab:recommendation:wide}, for example \textit{R15-open-source} and \textit{R16-format}, we leverage \textbf{open-source}, \textbf{domain-independent}, \textbf{system-neutral}, \textbf{interoperable} and, most importantly, \textbf{community-driven} standards as the basis for the design and formatting of reproducible and interoperable workflow-based ROs. The profile description in this section corresponds to \textit{CWLProv} 0.6.0 \citep{cwlprov} (see \url{https://w3id.org/cwl/prov} for the latest profile).
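In practice, a \textit{CWLProv} Research Object is produced as a by-product of running a workflow; for instance, with the reference runner \textit{cwltool} a research object folder can be requested at enactment time. The invocation below is a hedged sketch: the workflow and job file names are placeholders, and the exact option should be checked against the \textit{cwltool} release in use (the implementation is described in Section \textbf{\nameref{sec:demo}}).
\begin{verbatim}
import subprocess

# Ask cwltool to enact a workflow and, in addition to the normal outputs,
# write a BagIt-serialised research object (inputs, outputs, workflow,
# PROV traces) into the named folder. File names are placeholders.
subprocess.run(
    ["cwltool", "--provenance", "revsort_run_ro",
     "revsort.cwl", "revsort-job.json"],
    check=True,
)
\end{verbatim}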
\subsection{Applied Standards and Vocabularies} \label{sec:standards}
We follow the recommendation \textit{``Reuse vocabularies, preferably standardized ones''} \citep{reusevocab} from best practices associated with data sharing, representation and publication on the web to achieve consensus and interoperability of workflow-based analyses. Specifically, we integrate the \emph{Common Workflow Language} (CWL) for workflow definition, \emph{Research Objects} (ROs) for resource aggregation and the \emph{PROV Data Model} (PROV-DM) to support the retrospective provenance associated with workflow enactment. The key properties and principles of these standards are described below.
\subsubsection{\textcolor{black}{Common Workflow Language (CWL)}}
Common Workflow Language \citep{cwl} provides declarative constructs for workflow structure and command line tool interface definition. It makes minimal assumptions about base software dependencies, configuration settings, software versions, parameter settings or indeed the execution environment more generally \citep{kanwal_2017}. The CWL object model supports comprehensive recording and capture of information for workflow design and execution. This can subsequently be published as structured information alongside any resultant analysis using that workflow.
CWL is a community-driven standard effort that has been widely adopted by many workflow design and execution platforms, supporting interoperability across a set of diverse platforms. Current adopters include Toil, Arvados, Rabix \citep{kaushik_2017}, Cromwell \citep{cromwell}, REANA, and Bcbio \citep{guimera_2012} with implementations for Galaxy, Apache Taverna, and AWE currently in progress.
\begin{figure*} [t!]
\centering
\includegraphics[width=.7\textwidth]{images/twostep}
\captionsetup{justification=centering}
\caption{Left: A snapshot of part of a GATK workflow described using CWL. Two steps, named \textit{bwa-mem} and \textit{samtools-view}, are shown; the former links to the tool description executing the underlying tool (BWA-mem for alignment) and provides the output used as input for samtools. Right: Snapshot of BWA-mem.cwl and the associated Docker requirements for the exact tool version used in the workflow execution.}\label{fig:bwa-mem}
\end{figure*}
A workflow in CWL is composed of “steps” where each step refers either to a command line tool (also specified using CWL) or another workflow specification incorporating the concept of “sub-workflows”. Each “step” is associated with “inputs” that are comprised of any data artefact required for the execution of that step (Figure \ref{fig:bwa-mem}). As a result of the execution of each step, “outputs” are produced which can become (part of) “inputs” for the next steps making the execution data-flow oriented. CWL is not tied to a specific operating system or platform which makes it an ideal approach for interoperable workflow definitions.
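To illustrate this wiring, the following minimal Python sketch (assuming PyYAML is available; the file name \texttt{workflow.cwl} and the handling of the map and list forms of \texttt{steps} and \texttt{in} are illustrative, not part of our implementation) prints each step of a CWL \texttt{Workflow} document together with the sources feeding its inputs.

\begin{verbatim}
import yaml  # PyYAML, assumed installed

# Hypothetical CWL Workflow document (YAML and JSON are interchangeable)
with open("workflow.cwl") as f:
    wf = yaml.safe_load(f)

# 'steps' may be a mapping (id -> step) or a list of step objects with 'id'
steps = wf.get("steps", {})
if isinstance(steps, list):
    steps = {s["id"]: s for s in steps}

for step_id, step in steps.items():
    print("step:", step_id, "runs:", step.get("run"))
    inputs = step.get("in", {})
    if isinstance(inputs, list):
        inputs = {i["id"]: i for i in inputs}
    for name, binding in inputs.items():
        if isinstance(binding, str):        # plain "source" reference
            source = binding
        elif isinstance(binding, dict):     # e.g. {'source': ..., 'default': ...}
            source = binding.get("source")
        else:                               # e.g. a list of sources
            source = binding
        print("  input", name, "<-", source)
    print("  outputs:", step.get("out", []))
\end{verbatim}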
\subsubsection{\textcolor{black}{Research Object (RO)}}
A Research Object encapsulates all of the digital artefacts associated with a given computational analysis contributing towards preservation of the analysis \citep{bechhofer_2013}, together with their metadata, provenance and identifiers.
The aggregated resources can include but are not limited to: input and output data for analysis results validation; computational methods such as command line tools and workflow specifications to facilitate workflow re-enactment; attribution details regarding users; retrospective as well as prospective provenance for better understanding of workflow requirements, and machine-readable annotations related to the artefacts and the relationships between them. The goal of ROs is to make any published scientific investigation and the produced artefacts \textit{“interoperable, reusable, citable, shareable and portable”}.
The three core principles \citep{roprinciples} of the RO approach are to support ``Identity'', ``Aggregation'', and ``Annotation'' of research artefacts. These principles aim to make the tightly-coupled, interrelated and well-understood resources aggregated in a computational analysis accessible as identifiable objects, e.g. using unique (persistent) identifiers such as DOIs and/or ORCIDs. The RO approach is well aligned with the idea of interoperable and platform-independent solutions for provenance capture of workflows because of its domain-neutral and platform-independent nature.
While ROs can be serialized in several different ways, in this work we have reused the BDBag approach based on \textit{BagIt} (see box), which has been shown to support large-scale workflow data \citep{chard_2016}. This approach is also compatible with data archiving efforts from the NIH Data Commons, Library of Congress and the Research Data Alliance. The specialized workflow-centric RO in this study encompasses the components mentioned in the previous paragraph annotated with various targeted tools and a PROV-based \textit{Workflow provenance profile} to capture the detailed retrospective provenance of the CWL workflow enactment.
\subsubsection{\textcolor{black}{PROV Data Model (PROV-DM)}}
The World Wide Web Consortium (W3C) developed \emph{PROV}, a suite of specifications for unified/interoperable representation and publication of provenance information on the Web. The underlying conceptual PROV Data Model (PROV-DM) \citep{PROVDM} provides a domain-agnostic model designed to capture fundamental features of provenance with support for extensions to integrate domain-specific information (Figure \ref{fig:prov-dm}).
\begin{figure} [!b]
\centering
\includegraphics[width=\linewidth]{images/key-concepts.pdf}
\captionsetup{justification=centering}
\caption{Core concepts of the PROV Data Model. \\Adapted from W3C PROV Model Primer \citep{PROVModel}. }\label{fig:prov-dm}
\end{figure}
We mainly utilize two serialisations of PROV for this study: PROV-Notation (PROV-N) \citep{moreau_2013} and PROV-JSON \citep{huynh_2013}. PROV-N is designed to achieve serialisation of PROV-DM instances by formally representing the information using a simplified textual syntax to improve human readability. PROV-JSON is a lightweight interoperable representation of PROV assertions using JavaScript constructs and data types. The key design and implementation principles of these two serialisations of PROV are in compliance with the goals of this study, i.e. understandable and interoperable, and hence they are a natural choice to support the design of an adaptable provenance profile. For completeness we also explored serializing the provenance graph as PROV-XML \citep{PROVXML} as well as PROV-O \citep{PROVO}, which provides a mapping to Linked Data and ontologies, with potential for rich queries and further integration using a triple store. One challenge here is the wide variety of OWL and RDF formats; we opted for Turtle, N-Triples and JSON-LD, but concluded that requiring all of these PROV and RDF serializations would be an unnecessary burden for other implementations of \textit{CWLProv}.
\subsection{\textit{CWLProv} Research Object} \label{sec:cwlprovRO}
The provenance framework defined in the previous section can be satisfied by using a structured approach to share the identified resources. In this section, we define the representation of data and metadata to be shared for a given workflow enactment, stored as multiple files in their native formats. The folder structure of the \textit{CWLProv} Research Object complies with the \emph{BagIt} \citep{bagit17} format such that its content and completeness can be verified with any BagIt tool or library (see box \textbf{What is BagIt?}). The files used and generated by the workflow are here considered the \emph{data payload}; the remaining directories include \emph{metadata} describing how the workflow results were created. We systematized the aggregated resources into various collections for better understanding and accessibility for a CWL workflow execution (Figure \ref{fig:RO-format}).
\begin{figure*} %[b!]
\centering
\includegraphics[width=.7\textwidth]{images/RO-structure-NEW-file}
\captionsetup{justification=centering}
\caption{Schematic representation of the aggregation and links between the components of a given workflow enactment. Layers of execution are separated for clarity. The workflow specification and command line tool specifications are described using CWL. Each individual command line tool specification can optionally interact with Docker to satisfy software dependencies. [A] The RO layer shows the structure of the RO including its content and interactions with different components in the RO and [B] the CWL layer. }\label{fig:RO-format}
\end{figure*}
\subsubsection{\textcolor{black}{data/}}
\texttt{data/} is the \emph{payload} collection of the Research Object; in \textit{CWLProv} this contains all input and output files used in a given workflow enactment. Data files should be labelled and identified based on a hashed checksum rather than derived from their file paths during workflow execution. This use of \emph{content-addressable} reference and storage \citep{services_2012} simplifies identifier generation for data and helps to avoid local dependencies, e.g. hard-coded file names. However, the workflow execution engine might use other unique identifiers for file objects. It is advised to re-use such identifiers to avoid redundancy and to comply with the system/platform used to run the workflow.
\subsubsection{\textcolor{black}{workflow/}}
\textit{CWLProv} ROs must include a system-independent executable version of the workflow under the \texttt{workflow/} folder. When using CWL, this sub-folder must contain the complete executable \emph{workflow specification} file, an \emph{input file object} with the parameter settings used to enact the workflow and an \emph{output file object} generated as a result of the workflow enactment. The latter contains details of the workflow outputs such as data files produced by the workflow, but may exclude intermediate outputs.
To ensure RO portability, these file objects may not exactly match the file names at enactment time, as the absolute paths of the inputs are recommended to be replaced with relativized \emph{content-addressed} paths within the RO, e.g. \texttt{/home/alice/exp15/sequence.fa} is replaced with \texttt{../data/b1/b1946ac92492d2347c6235b4d2611184}. The input file object should also capture any dependencies of the input data files, such as \texttt{.bam.bai} indexes neighbouring \texttt{.bam} (\emph{Binary Alignment Map}) files. Any folder objects should be expanded to list contained files and their file names at time of enactment.
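The following minimal Python sketch (standard library only; the hash algorithm, helper name and paths are illustrative rather than prescriptive) shows the general pattern of content-addressing an input file under \texttt{data/} and returning the relativized path used in place of the absolute one.

\begin{verbatim}
import hashlib
import shutil
from pathlib import Path

def content_address(path, ro_dir, algorithm="sha1"):
    """Copy a file into <ro_dir>/data/<xx>/<checksum>, return the relative path.

    The two leading hex characters form a sub-folder, mirroring the
    ../data/b1/b1946... layout shown above; the digest choice is illustrative.
    """
    digest = hashlib.new(algorithm)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            digest.update(chunk)
    checksum = digest.hexdigest()
    target = Path(ro_dir) / "data" / checksum[:2] / checksum
    target.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(path, target)
    return "../data/{}/{}".format(checksum[:2], checksum)

# Hypothetical usage: relativize an absolute input path for the RO
print(content_address("/home/alice/exp15/sequence.fa", "ro"))
\end{verbatim}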
In the case of a CWL workflow, \textit{cwltool} can aggregate the CWL description and any referenced external descriptions (such as sub-workflows or command line tool descriptions) into a single workflow file using \texttt{cwltool -{}-pack}. This feature is used in our implementation (details in section \textbf{\nameref{sec:demo}}) to rewrite the workflow files, making them re-executable without depending on workflow or command line descriptions on the file system outside the RO. Other workflow definition approaches, WMSs or CWL executors should apply similar features to ensure workflow definitions are executable outside their original file system location.
\begin{mdframed}[linewidth=1pt,linecolor=black,
innerleftmargin=8pt,innerrightmargin=8pt,
innertopmargin=8.2pt,innerbottommargin=6pt]
{\fontsize{8.2pt}{10pt}\bfseries What is BagIt?\par}
\textbf{BagIt} is an IETF Internet Standard (RFC8493)\citep{bagit17} that defines a structured file hierarchy for the purpose of digital preservation of data files. BagIt was initiated by the US Library of Congress and the California Digital Library, and is now used by libraries and archives to ensure safe transmission and storage of datasets using ``bags''.
A \textbf{bag} is indicated by the presence of \texttt{bagit.txt} and a \emph{payload} of digital content stored as files and sub-folders in the \texttt{data/} folder. Other files are considered \emph{tag files} to further describe the payload. All the payload files are listed in a \emph{manifest} with checksums of their byte content, e.g. \texttt{manifest-sha256.txt} and equivalent for tag files in \texttt{tagmanifest-sha256.txt}. Basic metadata can be provided in \texttt{bag-info.txt} as key-value pairs.
A bag can be checked to be \emph{complete} if all the files listed in the manifests exist, and is also considered \emph{valid} if the manifest matches the checksum of each file, ensuring they have been correctly transferred.
\textbf{BDBag} (Big Data bag)\citep{chard_2016} is a profile of BagIt that adds a \emph{Research Object}\citep{RObundle} \texttt{metadata/manifest.json} in JSON-LD \citep{JSONLD} format to contain richer Linked Data annotations that may not fit well in \texttt{bag-info.txt}, e.g. authors of an individual file. BDBags can include a \texttt{fetch.txt} to reference external resources using \emph{ARK MinIDs} or HTTP URLs, allowing bags that contain large files without necessarily transferring their bytes.
\end{mdframed}
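As a hedged illustration of the completeness and validity checks described in the box, the sketch below uses the \textit{bagit-python} library from the Library of Congress (assumed to be installed; the bag path is hypothetical) to validate an unpacked bag such as a \textit{CWLProv} RO folder.

\begin{verbatim}
import bagit  # bagit-python, assumed installed (pip install bagit)

# Open an existing bag, e.g. an unpacked CWLProv RO folder (hypothetical path)
bag = bagit.Bag("ro/")

# "complete": every file listed in the manifests exists (and vice versa);
# "valid": additionally, every checksum matches the file's byte content.
try:
    bag.validate()  # raises bagit.BagValidationError on failure
    print("bag is complete and valid")
except bagit.BagValidationError as err:
    print("validation failed:", err)

# Creating a new bag from a payload directory works similarly:
#   bagit.make_bag("my_payload_dir")
\end{verbatim}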
\subsubsection{\textcolor{black}{snapshot/}}
\texttt{snapshot/} comprises copies of the workflow and tool specifications files “as-is” at enactment time, without any rewrites, packing or relativizing as described above.
It is recommended to use snapshot resources only for checking the validity of results and for understanding the workflow enactment, since these files might contain absolute paths or be host-specific, and thus re-enactment elsewhere may not be possible. Preserving these files untouched may nevertheless retain information that could otherwise get lost, e.g. commented-out workflow code, or identifiers baked into file names.
A challenge in capturing snapshot files is that they typically live within a file system hierarchy which can be difficult to replicate accurately, and they may have internal references to other files. In our implementation we utilize \texttt{cwltool -{}-print-deps} to find indirectly referenced files and store their snapshots in a flat folder.
\subsubsection{\textcolor{black}{metadata/}}
Each \textit{CWLProv} RO must contain an RO manifest file \texttt{metadata/manifest.json} and two sub-directories \texttt{metadata/logs} and \texttt{metadata/provenance}. The RO manifest, part of the BDBag \citep{chard_2016} profile, follows the JSON-LD structure defined for Research Object Bundles \citep{RObundle} and can provide structured Linked Data for each file in the RO, such as file type and creation date. Further detail about the manifest file contents is documented on GitHub in the \textit{CWLProv} specification \citep{cwlprov}.
Any raw log information from the workflow enactment should be made available in \texttt{metadata/logs}. This typically includes the actual commands executed for each step. Similar to the snapshot files, log files may however be difficult to process outside the original enactment system. An example of such processing is \emph{CWL-metrics} \citep{10.1093/gigascience/giz052}, which post-processes cwltool log files to capture runtime metrics of individual Docker containers.
Capturing the details of a workflow execution requires rich metadata in provenance files (see section \textbf{\nameref{sec:provenanceprofile}}). These should exist in the sub-folder \texttt{metadata/provenance}. It is recommended that the availability of a \textit{primary} provenance file be mandatory; this file should conform to the PROV-N \citep{moreau_2013} format and describes the top-level workflow execution. As described in \textit{Level 2} (Section \textbf{\nameref{sec:levels}}), it is quite possible to have nested workflows; in that case, a separate provenance file for each nested workflow execution should be included in this folder. Additional provenance serialisations such as PROV-JSON \citep{huynh_2013}, PROV-XML \citep{PROVXML} or PROV-O \citep{PROVO} may also be included in this folder, in which case declaring their formats in the RO manifest using \texttt{conformsTo} is mandatory. The provenance profile of a nested workflow should be named such that there is a link between the respective step in the primary workflow and the nested workflow, preferably using unique identifiers.
As the PROV-DM has a generalized structure, there might be some provenance aspects specific to particular workflows that are hard to capture using PROV-N alone; hence ontologies such as \textit{wfdesc} \citep{wf4ever1} can be used to describe the abstract representation of the workflow and its steps. Use of \textit{wfprov} \citep{wf4ever2} to capture some workflow provenance aspects is also encouraged. Alternative extensions such as ProvOne \citep{caoprovone} can also be utilized if the WMS or workflow executor is already using these extensions.
\textit{CWLProv} reuses Linked Data standards like JSON-LD \citep{JSONLD}, W3C PROV \citep{PROVDM} and Research Object \citep{hettne_2014}. A challenge with Linked Data in distributed and desktop computing is how to make identifiers that are absolute URIs and hence globally unique. For example, for \textit{CWLProv} a workflow may be executed by an engine that does not know where its workflow provenance will be stored, published or finally integrated. To this end \textit{CWLProv} generators should use the proposed \emph{arcp} \citep{soilandreyes_2018} URI scheme to map local file paths within the RO BagIt folder structure to absolute URIs for use within the RO manifest and associated PROV traces. Consumers of \textit{CWLProv} ROs that do not contain an arcp-based External-Identifier should generate a temporary arcp base to safely resolve any relative URI references not present in the \textit{CWLProv} folder. Implementations processing a \textit{CWLProv} RO may convert arcp URIs to local \texttt{file:///} or \texttt{http://} URIs depending on how and where the \textit{CWLProv} RO was saved, e.g. using the ``arcp.py'' library \citep{arcp_ro2018}.
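A minimal sketch of this identifier scheme, using only the Python standard library (the cited \texttt{arcp.py} library provides equivalent helpers), is shown below; the UUID is random and the relative path is taken from the earlier example.

\begin{verbatim}
import uuid
from urllib.parse import quote

# Generate a temporary arcp base for an RO without an External-Identifier.
# arcp URIs of the "uuid" flavour take the form: arcp://uuid,<uuid>/
base = "arcp://uuid,{}/".format(uuid.uuid4())

# An absolute URI for a file inside the RO is the base plus the
# percent-encoded relative path from the bag root.
relative_path = "data/b1/b1946ac92492d2347c6235b4d2611184"
absolute_uri = base + quote(relative_path)
print(absolute_uri)
# e.g. arcp://uuid,0f6e8d3a-.../data/b1/b1946ac92492d2347c6235b4d2611184
\end{verbatim}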
\subsection{Retrospective Provenance Profile}\label{sec:provenanceprofile}
\begin{table*}[!htbp]
\caption{Fulfilling recommendations with the \textit{CWLProv} profile of W3C PROV, extended with Research Object Model's \textit{wfdesc} (prospective provenance) and \textit{wfprov} (retrospective provenance). }\label{tab:provProfile}
\begin{tabularx}{\linewidth}{l l l l L}
\toprule
PROV type & Subtype & Relation & Range & Recommendation \\
\midrule
\textbf{Plan} & wfdesc:Workflow & wfdesc:hasSubProcess & wfdesc:Process & R9-workflow\\
& wfdesc:Process & & \\
\textbf{Activity} & wfprov:WorkflowRun & wasAssociatedWith &
wfprov:WorkflowEngine & R8-environment\\
& & \nary{hadPlan} & ~ wfdesc:Workflow & R9-workflow, R17-executable \\
& & wasStartedBy & wfprov:WorkflowEngine & R8-environment \\
& & \nary{atTime} & ~ \placeholder{ISO8601 timestamp} & R13-provenance \\
& & wasStartedBy & wfprov:WorkflowRun & R9-workflow \\
& & wasEndedBy & wfprov:WorkflowEngine & R8-environment \\
& & \nary{atTime} & ~ \placeholder{ISO8601 timestamp} & R13-provenance \\
& wfprov:ProcessRun & wasStartedBy & wfprov:WorkflowRun & R10-software \\
& & \nary{atTime} & ~ \placeholder{ISO8601 timestamp} & R14-provenance\\
& & used & wfprov:Artifact & R11-raw-data \\
& & \nary{role} & ~ wfdesc:InputParameter & R1-parameters \\
& & wasAssociatedWith & wfprov:WorkflowRun & R9-workflow \\
& & \nary{hadPlan} & ~ wfdesc:Process & R17-executable, R16-format \\
& & wasEndedBy & wfprov:WorkflowRun & R13-provenance \\
& & \nary{atTime} & ~ \placeholder{ISO8601 timestamp} & R13-provenance \\
& SoftwareAgent & wasAssociatedWith & wfprov:ProcessRun & R8-environment
\\
& & \nary{cwlprov:image} & ~ \placeholder{docker image id} & R4-sw-version \\
\textbf{SoftwareAgent} & wfprov:WorkflowEngine & wasStartedBy & Person \placeholder{ORCID} & R12-attribution \\
& & label & \placeholder{cwltool \texttt{-{}-}version} & R4-sw-version \\
\textbf{Entity} & wfprov:Artifact & wasGeneratedBy & wfprov:ProcessRun & R3-intermediate, R7-identifier \\
& & \nary{role} & ~ wfdesc:OutputParameter & R1-parameters \\
\textbf{Collection} & wfprov:Artifact & hadMember & wfprov:Artifact & R3-intermediate\\
& Dictionary & hadDictionaryMember & wfprov:Artifact & \\
& & \nary{pairKey} & ~ \placeholder{filename} & R7-identifier \\
\bottomrule
\end{tabularx}
\begin{tablenotes}
\item Indentation with \return indicates n-ary relationships which are expressed differently depending on PROV syntax.
Namespaces:
\url{http://www.w3.org/ns/prov#} (default),
\url{http://purl.org/wf4ever/wfdesc#} (\textit{wfdesc}),
\url{http://purl.org/wf4ever/wfprov#} (\textit{wfprov}),
\url{https://w3id.org/cwl/prov#} (\textit{cwlprov})
\end{tablenotes}
\end{table*}
As stated earlier, the primary provenance file should conform to the PROV-N \citep{moreau_2013} serialisation of the PROV data model, and may optionally use ontologies specific to the workflow execution. The key features used in the structure of the retrospective provenance profile for a CWL workflow enactment in \textit{CWLProv} are listed in Table \ref{tab:provProfile}. These features are not tied to any platform or workflow definition approach and hence can be used to document retrospective provenance of any workflow irrespective of the workflow definition approach.
The core mapping follows the PROV data model as in Figure \ref{fig:prov-dm}: a PROV \emph{Activity} represents the duration of a workflow run, as well as individual step executions, which \emph{used} files and data (\emph{Entity}), which in turn may have been generated by (\emph{wasGeneratedBy}) previous step activities. The workflow engine (e.g. cwltool) is the \emph{Agent} controlling these activities according to the workflow definition (\emph{Plan}).
PROV is a general standard not specific to workflows, and lacks features to relate a \emph{plan} (i.e. a workflow description) with sub-plans and workflow-centric retrospective provenance elements, e.g. a specific workflow enactment and its related step enactments. We have utilized \textit{wfdesc} and \textit{wfprov} to represent a few elements of prospective and retrospective provenance respectively. In addition, the provenance profile documents details of all the uniquely identified \textit{activities}, e.g. the workflow enactment and related command line tool invocations, and their associated \textit{entities} (e.g. input and output data artefacts, input configuration files, workflows and command line tool specifications). The profile also documents the relationships between activities, such as which activity (workflow enactment) was responsible for starting and ending another activity (command line tool invocation).
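To make this mapping concrete, the following minimal sketch uses the \textit{prov} Python library (also used by our implementation) to assert a workflow run, a single step and its input and output artefacts; all identifiers, namespace prefixes, timestamps and labels are illustrative and do not reproduce the exact statements emitted by \textit{cwltool}.

\begin{verbatim}
from datetime import datetime
from prov.model import ProvDocument

doc = ProvDocument()
# Illustrative namespaces; CWLProv registers additional ones, e.g. for data items
doc.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
doc.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
doc.add_namespace("ex", "arcp://uuid,00000000-0000-0000-0000-000000000000/")

engine = doc.agent("ex:engine", {"prov:type": "wfprov:WorkflowEngine",
                                 "prov:label": "cwltool (version illustrative)"})
run = doc.activity("ex:workflow-run",
                   other_attributes={"prov:type": "wfprov:WorkflowRun"})
doc.wasAssociatedWith(run, engine, plan="ex:packed-workflow")  # wfdesc:Workflow
doc.wasStartedBy(run, starter=engine, time=datetime(2018, 8, 8, 10, 0, 0))

step = doc.activity("ex:step-bwa-mem",
                    other_attributes={"prov:type": "wfprov:ProcessRun"})
doc.wasStartedBy(step, starter=run)

used = doc.entity("ex:data-b1946ac9", {"prov:type": "wfprov:Artifact"})
doc.used(step, used)
produced = doc.entity("ex:data-aligned", {"prov:type": "wfprov:Artifact"})
doc.wasGeneratedBy(produced, step)

print(doc.get_provn())            # PROV-N textual serialisation
print(doc.serialize(indent=2))    # PROV-JSON
\end{verbatim}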
As described in Section \textbf{\nameref{sec:levels}}, in order to achieve maximum \textit{white-box} provenance, the inner workings of a nested workflow should also be included in the provenance trace. If a step represents a nested workflow, a separate provenance profile is included in the RO. Moreover, in the parent workflow trace, this relationship is recorded using \textit{has\_provenance} as an attribute of the \textit{Activity} step which refers to the profile of the nested workflow.
\section{Practical Realisation of \textit{CWLProv}} \label{sec:demo}
\textit{CWLProv} \citep{cwlprov} provides a format that can be adopted by any workflow executor or platform, provided that the underlying workflow definition approach is at least as declarative as CWL, i.e. it captures the necessary components described in Section \textbf{\nameref{sec:standards}}. In the case of CWL, as long as the conceptual constructs are common amongst the available implementations and executors, a workflow enactment can be represented in the \textit{CWLProv} format. To demonstrate the practical realisation of the proposed model we consider the Python-based reference implementation of CWL, \textit{cwltool}.
\textit{cwltool} is a feature complete reference implementation of CWL. It provides extensive validation of CWL files as well as offering a comprehensive set of test cases to validate new modules introduced as extensions to the existing implementation. Thus it provides the ideal choice for implementing \textit{CWLProv} for provenance support and resource aggregation. The existing classes and methods of the implementation were utilized to achieve various tasks such as packaging of the workflow and all associated tool specifications together. In addition, the existing python library \textit{prov} \citep{provpython} was used to create a provenance document instance and populate it with the required artefacts generated as the workflow enactment proceeds.
It should be noted that we elected to implement \textit{CWLProv} in the reference implementation \textit{cwltool} instead of the more scalable and production-friendly CWL implementations like Toil \citep{vivian2017toil}, Arvados \citep{arvados}, Rabix \citep{kaushik_2017}, CWL-Airflow \citep{cwlairflow2018} or Cromwell \citep{cromwell}. An updated list of implementations is available at the CWL homepage \footnote{\url{https://www.commonwl.org/\#Implementations}}. Compared to \textit{cwltool} these generally have extensive scheduler and cloud compute support, and extensions for large data transfer and storage, and should therefore be considered for any adopters of the Common Workflow Language. In this study we have however focused on \textit{cwltool} as its code base was found to be easy to adapt for rich provenance capture without having to modify subsystems for distributed execution or data management, and as a reference implementation better informing us on how to model \textit{CWLProv} for the general case rather than being tied into execution details of the more sophisticated CWL workflow engines.
\textit{CWLProv} support for \textit{cwltool} is built as an optional module which when invoked as \textit{``cwltool \texttt{-{}-}provenance ro/ workflow.cwl job.json''}, will automatically generate an RO with the given folder name \textit{ro/} without requiring any additional information from the user. Each input file is assigned a hash value and placed in the folder \textit{ro/data}, making it content-addressable to avoid local dependencies (Figure \ref{fig:processflow}).
\begin{figure*} [t!]
\includegraphics[width= 0.8\textwidth]{images/ProvenanceProcessFlow.png}
\centering
\captionsetup{justification=centering,margin=2cm}
\caption{High level process flow representation of retrospective provenance capture }\label{fig:processflow}
\end{figure*}
In order to avoid including information about attribution without consent of the user, we introduce an additional flag \textit{`` \texttt{-{}-}enable-user-provenance''}. If a user provides the options \textit{ \texttt{-{}-}orcid} and \textit{ \texttt{-{}-}full-name}, this information will be included in the provenance profile related to user attribution. Enabling \textit{`` \texttt{-{}-}enable-user-provenance''} and not providing the full name or ORCID will store user account details from the local machine for attribution, i.e. the details of the \textit{agent} that enacted the workflow.
The workflow and command line tool specifications are aggregated in one file to create an executable workflow and placed in the folder \textit{ro/workflow}. This folder also contains the transformed input job object containing the input parameters, with references to artefacts in \textit{ro/data} based on relativising the paths present in the input object. These two files are sufficient to re-enact the workflow, provided the other required artefacts are also included in the RO and comply with the \textit{CWLProv} format. The \textit{cwltool} control flow \citep{cwltool-controlflow} indicates the points when the execution of the workflow and the command line tools involved in the workflow enactment start and end, and how the output is reported back. This information and the artefacts are captured and stored in the RO.
When the execution of a workflow begins, \textit{CWLProv} extensions to \textit{cwltool} generate a provenance document (using the \textit{prov} library) which includes default namespaces for the workflow enactment \textit{“activity”}. The attribution details as an \textit{agent} are also added at this stage if user provenance capture is enabled, e.g. to answer ``who ran the workflow?''. Each step of the workflow can correspond to either a command line tool or another nested workflow referred to as a \textit{sub-workflow} in the CWL documentation. For each nested workflow, a separate provenance profile is initialized recursively to achieve a \textit{white-box} finer-grained provenance view as explained in Section \textbf{\nameref{sec:levels}}. This profile is continually updated throughout the nested workflow enactment. Each step is identified by a unique identifier and recorded as an \textit{activity} in the parent workflow provenance profile, i.e. the \textit{``primary profile''}. The \textit{nested} workflow is recorded as a step in the \textit{primary profile} using the same identifier as the ``nested workflow enactment activity'' identifier in the respective provenance profile. For each step in the activity, the start time and association with the workflow activity is created and stored as part of the overall provenance to answer the question ``when did it happen?''.
The data used as input by these steps is either provided by the user or produced as an intermediate result from the previous steps. In both cases, the \textit{Usage} is recorded in the respective provenance profile using checksums as identifiers to answer the question ``what was used?''. The non-file input parameters such as strings and integers are stored ``as-is'' using an additional optional argument, \textit{prov:value}. Upon completion, each step typically generates some data. The provenance profile records the generation of outputs at the step level to record ``what was produced?'' and ``which process produced it?''. Once all steps complete, the workflow outputs are collected and the generation of these outputs at the workflow level is recorded in the provenance profile. Moreover, using the checksums of these files generated by \textit{cwltool}, content-addressable copies are saved in the folder \textit{ro/data}. The provenance profile refers to these files using the same checksums such that they are traceable or can be used for further analysis if required. The workflow specification, command line tool specifications and JSON job file are archived in the \textit{ro/snapshot} folder to preserve the actual workflow history.
This prototype implementation provides a model and guidance for workflow platforms and executors to identify their respective features that can be utilized in devising their own implementation of \textit{CWLProv}.
\subsection{Achieving recommendations with provenance levels}
Table \ref{tab:fulfilling} maps the best practices and recommendations from Table \ref{tab:recommendation:wide} to the levels of provenance (Figure \ref{fig:levels}). The methods and implementation readiness shown indicate to what extent the recommendations are addressed by the implementation of \textit{CWLProv} (detailed in this section).
Note that other approaches may solve this mapping differently. For instance, Nextflow \citep{ditommaso_2017} may fulfill \textit{R18-resource-use} at Provenance \nameref{sec:level2} as it can produce trace reports with hardware resource usage per task execution \citep{nextflow_tracing}, but not for the overall workflow. While a Nextflow trace report is a separate CSV file with implementation-specific columns, our planned \textit{R18-resource-use} approach for CWL is to combine \textit{CWL-metrics} \citep{tazro2018}, permalinks and the standard \textit{GFD.204} \citep{cristofori2013usage} to further relate resource use with \nameref{sec:level1} and \nameref{sec:level2} provenance within the \textit{CWLProv} Research Object.
In addition to following the recommendations from Table \ref{tab:recommendation:wide} through computational methods, workflow authors are also required to exercise \textit{best practices for workflow design and authoring}. For instance, to achieve \textit{R1-parameters} the workflow must be written in such a way that parameters are exposed and documented at the workflow level, rather than hard-coded within an underlying Python script. Similarly, while the CWL format supports rich user annotations that can fulfill \textit{R6-annotation}, for these to survive into a Research Object at execution time, such annotation capabilities must actually be used by workflow authors instead of unstructured text files.
It should be a goal of a scientific WMS to guide users towards achieving the required level of the provenance framework through automation where possible. For instance, a user may have specified a Docker container image in the workflow without pinning its version, but the provenance log could still record the specific container version used at execution time, achieving \textit{R4-sw-version} retrospectively by computation rather than relying on a prospective declaration in the workflow definition.
\begin{table}[bt!]
\caption{Recommendations and provenance levels implemented in \textit{CWLProv}} \label{tab:fulfilling}
\begin{tabular}{l c c c c l}
\toprule
Recommendation & L0 & L1 & L2 & L3 & Methods \\
\midrule
R1-parameters & $\bullet$ && $\bullet$ && CWL, BP \\
R2-automate & $\bullet$ &&&& CWL, Docker \\
R3-intermediate && $\bullet$ &&& PROV, RO \\
R4-sw-version & $\bullet$ && $\bullet$ && CWL, Docker, PROV \\
R5-data-version & $\bullet$ &&& $\bullet$ & CWL, BP\\
R6-annotation && $\bullet$ && $\coasterisk$ & CWL, RO, BP \\
R7-described && $\bullet$ &&& CWL, RO\\
R7-identifier && $\bullet$ & $\bullet$ & $\bullet$ & RO, CWLProv\\
R8-environment && $\coasterisk$& $\coasterisk$ && GFD.204 \\
R9-workflow & $\bullet$ & $\bullet$ & $\bullet$ && CWL, wfdesc \\
R10-software & $\bullet$ && $\bullet$ && CWL, Docker \\
R11-raw-data & $\bullet$ & $\bullet$ &&& CWLProv, BP \\
R12-attribution & & $\bullet$ &&& RO, CWL, BP \\
R13-provenance & & $\bullet$ & $\bullet$ && PROV, RO \\
R14-diagram & $\circ$ &&& $\coasterisk$ & CWL, RO \\
R15-open-source & $\bullet$ &&&& CWL, BP \\
R16-format & & $\bullet$ && $\bullet$ & CWL, BP \\
R17-executable & $\circ$ & $\bullet$ &&& CWL, Docker \\
R18-resource-use & & $\coasterisk$ & $\coasterisk$ && CWL, GFD.204 \\
R19-example & $\coasterisk$ & $\circ$ &&& RO, BP \\
\bottomrule
\end{tabular}
\begin{tablenotes}
\item \textbf{CWL}: Common Workflow Language and embedded annotations
\item \textbf{RO}: Research Object model and BagIt
\item \textbf{PROV}: W3C Provenance model
\item \textbf{CWLProv}: Additional attributes in PROV
\item \textbf{wfdesc}: Prospective provenance in PROV
\item \textbf{BP}: Best Practice need to be followed manually
\item $\bullet$ Implemented
\item $\circ$ Partially implemented
\item $\coasterisk$ Implementation planned/ongoing
\end{tablenotes}
\end{table}
\section{CWLProv Evaluation with Bioinformatics Workflows} \label{sec:evaluation}
\textit{CWLProv} as a standard supports \textit{syntactic}, \textit{semantic} and \textit{pragmatic} interoperability (defined in Section \textbf{\nameref{sec:interoperability}}) of a given workflow and its associated results. We have defined a \textit{``common data format''} for workflow sharing and publication such that any executor or WMS with CWL support can interpret this information and make use of it. This ensures \textit{syntactic} interoperability between workflow executors on different computing platforms. Similarly, the \textit{``content''} of the shared aggregation artefact as a workflow-centric RO is unambiguously defined, ensuring uniform representation of the workflow and its associated results across different platforms and executors and hence supporting \textit{semantic} interoperability. With \textit{Level 3} provenance satisfied, providing domain-specific information along with Level 0--2 provenance tracking, we posit that \textit{CWLProv} would be able to accomplish \textit{pragmatic} interoperability by providing unambiguous information about the \textit{``context''}, \textit{``application''} and \textit{``use''} of the shared/published workflow-centric ROs. Hence, a future extension of the current implementation (described in section \ref{sec:demo}) to include domain-rich information in the provenance traces and the \textit{CWLProv} RO will result in pragmatic interoperability.
To demonstrate the interoperability and portability of the proposed solution, we evaluate \textit{CWLProv} and its reference implementation using open source bioinformatics workflows available on GitHub from different research initiatives and different developers. Conceptually, these workflows are selected for evaluation due to their widespread use in real-life data analyses and the variety of their input data. The alignment workflow is included in the evaluation as it is one of the most time consuming yet mandatory steps in any variant calling workflow. The practical choice of workflows by these particular groups, out of numerous existing implementations, is justified in each of the sections below.
\subsection{RNA-seq Analysis Workflow} \label{sec:rnaseq-wf}
\begin{figure*} [t!]
\centering
\includegraphics[width=\textwidth]{images/rnaseq-cwlviewer-half.png}
\captionsetup{justification=centering}
\caption{Portion of a RNA-seq workflow generated by CWL viewer \citep{robinson_2017}.}\label{fig:rna-seq}
\end{figure*}
RNA sequencing (RNA-seq) data generated by Next Generation Sequencing (NGS) platforms comprises short sequence reads that can be aligned to a reference genome, where the alignment results form the basis of various analyses such as quantitating transcript expression, identifying novel splice junctions and isoforms, and detecting differential gene expression \citep{dobin2015mapping}. RNA-seq experiments can link phenotype to gene expression and are widely applied in multi-centric cancer studies \citep{cohen2017scientific}. Computational analysis of RNA-seq data is performed by different techniques depending on the research goals and the organism under study \citep{Conesa2016}. The workflow \citep{rnaseq} included in this case study has been defined in CWL by one of the teams \citep{heliumda} participating in the NIH Data Commons initiative \citep{DataComm86}, a large research infrastructure program aiming to make digital objects (such as data generated during biomedical research and the software/tools required to utilize such data) shareable and accessible, and hence aligned with the FAIR principles \citep{wilkinson_2016}.
This workflow (Figure \ref{fig:rna-seq}), designed for the pilot phase of the NIH Data Commons initiative \citep{NIH-PILOT}, adapts the approach and parameter settings of Trans-Omics for Precision Medicine (TOPMed) \citep{TransOmi24}. The RNA-seq pipeline originated from the Broad Institute \citep{gtexpipe}. There are in total five steps in the workflow:
1) Read alignment using STAR \citep{Dobin2012} which produces aligned BAM files including the Genome BAM and Transcriptome BAM.
2) The Genome BAM file is processed using Picard MarkDuplicates \citep{MarkDuplicates} producing an updated BAM file containing information on duplicate reads (such reads can indicate biased interpretation).
3) SAMtools index \citep{li2009sequence} is then employed to generate an index for the BAM file, in preparation for the next step.
4) The indexed BAM file is processed further with RNA-SeQC \citep{DeLuca2012} which takes the BAM file, human genome reference sequence and Gene Transfer Format (GTF) file as inputs to generate transcriptome-level expression quantifications and standard quality control metrics.
5) In parallel with transcript quantification, isoform expression levels are quantified by RSEM \citep{li2011rsem}. This step depends only on the output of the STAR tool, and additional RSEM reference sequences.
For testing and analysis, the workflow author provided example data created by down-sampling the read files of a TOPMed public access dataset \citep{seo2012transcriptional}. Chromosome 12 was extracted from the \textit{Homo Sapiens Assembly 38} reference sequence and provided by the workflow authors. The required GTF and RSEM reference data files are also provided. The workflow is well documented, and a detailed set of instructions describing the steps performed to down-sample the data is also provided for transparency. The availability of example input data, the use of containerization for the underlying software and the detailed documentation were important factors in choosing this specific CWL workflow for the \textit{CWLProv} evaluation.
\subsection{Alignment Workflow} \label{sec:align}
Alignment is an essential step in variant discovery workflows and considered an obligatory \textit{pre-processing} stage according to Best Practices by the Broad Institute \citep{GATKBP}. The purpose of this stage is to filter low-quality reads before variant calling or other interpretative steps \citep{xu2018review}. The workflow for alignment is designed to operate on raw sequence data to produce analysis-ready BAM files as the final output. The typical steps followed include file format conversions, aligning the read files to the reference genome sequence, and sorting the resulting files.
\begin{figure*} [b!]
\centering
\includegraphics[width=0.95\textwidth]{images/new_alignment.png}
\captionsetup{justification=centering}
\caption{Alignment workflow representation generated by CWL viewer.}\label{fig:align}
\end{figure*}
The CWL alignment workflow \citep{alignment-wf} included in this evaluation (Figure \ref{fig:align}) was designed by Data Biosphere \citep{DataBios21}. It adapts the alignment pipeline \citep{docker-alignment} originally developed at the Abecasis Lab, The University of Michigan \citep{Abecasis14}. This workflow is also part of the NIH Data Commons initiative (as is the \nameref{sec:rnaseq-wf}) and comprises four stages.
The first step, ``Pre-align'', accepts a Compressed Alignment Map (CRAM) file (a compressed format for BAM files developed by the European Bioinformatics Institute (EBI) \citep{Cochrane2012}) and the human genome reference sequence as input and, using underlying SAMtools utilities such as view, sort and fixmate, returns a list of fastq files which can be used as input for the next step. The next step, ``Align'', also accepts the human reference genome as input along with the output files from ``Pre-align'' and uses BWA-mem \citep{li2013aligning} to generate aligned reads as BAM files. SAMBLASTER \citep{samblaster} is used to mark duplicate reads and SAMtools view to convert read files from SAM to BAM format. The BAM files generated after ``Align'' are sorted with ``SAMtools sort''. Finally, these sorted alignment files are merged to produce a single sorted BAM file using SAMtools merge in the ``Post-align'' step. The authors provide an example CRAM file and the \textit{Homo Sapiens Assembly 38} reference genome along with its index files to be used as inputs for testing and analysis of the workflow.
\subsection{Somatic Variant Calling Workflow}
Variant discovery analysis for high-throughput sequencing data is a widely used bioinformatics technique, focused on finding genetic associations with diseases, identifying somatic mutations in cancer and characterizing heterogeneous cell populations \citep{variantcallinglecture}. The \textit{pre-processing} explained for the Alignment workflow is part of any variant calling workflow as reads are classified and ordered as part of the variant discovery process. Numerous variant calling algorithms have been developed depending on the input data characteristics and the specific application area \citep{xu2018review}. Somatic variant calling workflows are designed to identify somatic (non-inherited) variants in a sample - generally a cancer sample - by comparing the set of variants present in a sequenced tumour genome to a non-tumour genome from the same host \citep{Saunders2012}. The set of tumour variants is a super-set of the set of host variants, and somatic mutations can be identified through various algorithmic approaches to subtracting host familial variants. Each somatic variant calling workflow typically consists of three stages: pre-processing; variant evaluation and post-filtering.
The somatic variant calling workflow (Figure \ref{fig:somatic}) included in this case study is designed by Blue Collar Bioinformatics (bcbio) \citep{bcbio}, a community-driven initiative to develop best-practice pipelines for variant calling, RNA-seq and small RNA analysis workflows. According to the documentation, the goal of this project is to facilitate the automated analysis of high throughput data by making the resources \textit{quantifiable}, \textit{analyzable}, \textit{scalable}, \textit{accessible} and \textit{reproducible}. All the underlying tools are containerized facilitating software use in the workflow. The somatic variant calling workflow defined in CWL is available on GitHub \citep{bcbiowf} and equipped with a well defined test dataset.
\begin{figure*} [t!] %% preferably at bottom or top of column
\centering
\includegraphics[width=.65\textwidth, height=120mm]{images/2variantcalling.png}
\captionsetup{justification=centering}
\caption{Visual representation of the bcbio somatic variant calling workflow (Adapted from \citep{bcbiocwl}) and the subworkflow images are generated by CWL viewer.}\label{fig:somatic}
\end{figure*}
\subsection{Evaluation Activity} \label{eval-activity}
This section describes the evaluation of cross-executor and cross-platform interoperability of \textit{CWLProv}. To test cross-executor interoperability, two CWL executors \textit{cwltool} and \textit{toil-cwl-runner} were selected. \textit{toil-cwl-runner} is an open source Python workflow engine supporting robust cross-platform workflow execution on Cloud and High Performance Computing (HPC) environments \citep{vivian2017toil}. The two operating system platforms utilized in this analysis were MacOS and Ubuntu Linux. For the Linux OS, a 16-core Linux instance with 64GB RAM was launched on the Australian National eResearch Collaboration Tools and Resources (NeCTAR) research cloud \citep{Nectar}. To cater for the storage requirements, a 1000GB persistent volume was attached to this instance. For MacOS, a local system with 16GB RAM, 250GB storage and 2.8 GHz Intel Core i7 processor was used. These platforms were selected to cater for the required storage and compute resources of the workflows described above. The reference genome provided with \nameref{sec:align} was not down-sampled and hence this workflow required most resources among the three evaluated.
It is worth mentioning that this evaluation does not include details of the installation process for \textit{cwltool}, \textit{toil-cwl-runner} and \textit{Docker} on the systems described above. To create \textit{CWLProv} ROs during workflow execution, it is necessary to use the CWL reference runner (\textit{cwltool}) until this practice spreads to other CWL implementations. Moreover, it is assumed that the software container engine (Docker) is also installed on the system used to re-enact the workflow definitions aggregated in a given \textit{CWLProv} RO.
In addition, the resource requirements (identified in \textit{R18-resource-use} and discussed in Section \textbf{\nameref{sec:discussion}}) should also be satisfied by choosing a system with enough compute and storage resources for successful enactment. The systems used in this case study can serve as a reference when selecting a system, as inadequate compute and storage resources (such as insufficient RAM or too few cores) will hinder the successful re-enactment of workflows using these ROs. The hardware requirements may also vary if a different dataset is used as input to re-enact the workflow using the methods aggregated in the RO. In that case, the end user must ensure the availability of adequate compute and storage resources by choosing a system that meets the required specifications \citep{kanwal2017digital}.
Since the \textit{CWLProv} implementation is demonstrated for one of the executors (\textit{cwltool}), currently a \textit{CWLProv} RO for any workflow can only be produced using \textit{cwltool}. Hence, in this activity the workflows are initially enacted using just \textit{cwltool} (Table \ref{tab:eval}). The outline of the steps performed to analyse \textit{CWLProv} for each case study is as follows.
\begin{enumerate}[label=\Roman*)]
\item The workflow was enacted using \textit{cwltool} to produce a RO on a MacOS computer.
\begin{enumerate} [label=\arabic*)]
\item The resulting RO and aggregated resources were used to re-enact the workflow using \textit{toil-cwl-runner} on the same MacOS computer;
\item The RO produced in step I was transferred to the cloud-based Linux instance used in this activity;
\item On the cloud-based Linux environment and only utilizing the resources aggregated in the RO, the workflow was re-enacted using \textit{cwltool} and \textit{toil-cwl-runner}.
\end{enumerate}
\item The workflow was enacted using \textit{cwltool} to produce a RO on Linux.
\begin{enumerate} [label=\arabic*)]
\item The resulting RO and aggregated resource were utilized to re-enact the workflow using \textit{toil-cwl-runner} on the same cloud-based Linux instance;
\item The RO produced in step II was transferred to the MacOS computer used in this activity;
\item On the MacOS computer and only utilizing the resources aggregated in the RO, the workflow was re-enacted using \textit{cwltool} and \textit{toil-cwl-runner}.
\end{enumerate}
\end{enumerate}
The \textit{CWLProv} ROs produced as a result of this activity are published on Mendeley Data \citep{rnaseq_mendeley, alignment_mendeley, somatic_mendeley} with mirrors on Zenodo.
\subsection{Evaluation Results} \label{sec:eval-results}
The steps described above were taken to produce ROs, which were then used to re-enact the workflows (outlined in Table \ref{tab:eval}) without any further changes required. This demonstration illustrates the syntactic and semantic interoperability of the workflows across different systems. It shows that \textbf{both CWL executors were able to \textit{exchange}, \textit{comprehend} and \textit{use} the information represented as \textit{CWLProv} ROs}. The current implementation described in section \textbf{\nameref{sec:demo}} does not resolve \textit{Level 3}. Hence, the inclusion of domain-specific annotations referring to the scientific context to address pragmatic interoperability is identified as a crucial future direction and is further detailed in section \textbf{\nameref{sec:discussion}}.
\begin{table}[!htbp]
\caption {\textit{CWLProv} evaluation summary and status for the 3 bioinformatics case studies. }\label{tab:eval}
\begin{tabularx}{\linewidth}{c c c}
\toprule
Enact-produce RO with & Re-enact using RO with & Status \\
\midrule
\textit{cwltool} on MacOS & \textit{toil-cwl-runner} on MacOS & \checkmark \\
& \textit{cwltool} on Linux & \checkmark \\
& \textit{toil-cwl-runner} on Linux & \checkmark \\
\midrule
\textit{cwltool} on Linux & \textit{toil-cwl-runner} on Linux & \checkmark \\
& \textit{cwltool} on MacOS & \checkmark \\
& \textit{toil-cwl-runner} on MacOS & \checkmark \\
\bottomrule
\end{tabularx}
\end{table}
\subsubsection{\textcolor{black}{\textit{CWLProv} and Interoperability}}
CWL already builds on technologies such as JavaScript Object Notation for Linked Data (JSON-LD) \citep{JSONLD} for data modeling and Docker \citep{docker} to support portability of the run-time environments. Portability and interoperability as basic principles of the underlying workflow definition approach imply that any workflow-centric analysis should also be portable and interoperable. However, the workflow definition/specification alone is insufficient if the command line tool specifications, data and input configuration files used in the analysis are not readily available.
\textit{CWLProv} ensures availability of these resources for a given analysis conforming to the framework defined in Section \textbf{\nameref{sec:CWLProv}}. The input configurations are saved as \textit{primary-job.json} in folder \textit{workflow/} and refer to the input data contained in the payload \textit{data/} folder of the given RO. In this way, availability of data aggregated with the analysis is made possible. Existing features of \textit{cwltool} are used to generate the CWL workflow specification file containing all of the commandline tool specifications referred to in the workflow specification and placed in the same \textit{workflow/} folder.
One might argue that copying a folder tree might serve the same purpose, but in that case we would again be relying on users to put a substantial amount of effort on top of the actual analysis, i.e. they would have to carefully structure their directories to align with those of the workflow creators. Instead, CWL encourages researchers to utilize container technologies such as Docker or Singularity, or software packaging systems such as Debian (Med) or Bioconda, to ensure availability of the underlying tools, as recommended by numerous studies \citep{belhajjame_2015, kanwal_2017, garijo_2017, stodden_2016, Stodden2014, Gruening2018}. This practice facilitates the preservation of methods utilized in data-intensive scientific workflows and enables verification of the published claims without requiring the end-user to perform any manual installation and configuration. Examples of tools available via Docker containers used here are the alignment tool (BWA-mem) used in the Alignment workflow and the STAR aligner used in the RNA-seq workflow.
\subsubsection{\textcolor{black}{Evaluating Provenance Profile}}
The retrospective provenance profile generated as part of \textit{CWLProv} for each workflow enactment can be examined and queried to extract a required subset of information. \textit{Provenance analytics} is a separate domain and the next step after provenance collection in the provenance life cycle \citep{Missier2016}. Provenance data is often queried using specialized query languages such as SQL, SPARQL or TriQL, depending on the storage mechanism used. Query operations can combine information from prospective and retrospective provenance to better understand computational experiments.
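For illustration only, assuming an RO that includes the optional PROV-O (Turtle) serialisation and that the \textit{rdflib} library is installed (the trace file name is hypothetical), such a trace can be queried with SPARQL as sketched below.

\begin{verbatim}
from rdflib import Graph

# Hypothetical path to an optional PROV-O (Turtle) trace inside a CWLProv RO
g = Graph()
g.parse("metadata/provenance/primary.cwlprov.ttl", format="turtle")

# Which entities were used by which activities (workflow runs and steps)?
query = """
PREFIX prov: <http://www.w3.org/ns/prov#>
SELECT ?activity ?entity
WHERE {
    ?activity a prov:Activity ;
              prov:used ?entity .
}
"""
for activity, entity in g.query(query):
    print(activity, "used", entity)
\end{verbatim}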
The focus of this paper is not in-depth provenance analytics, but we have demonstrated the application of the provenance profile generated as part of \textit{CWLProv}. We have developed a command line tool and Python API, \textit{``cwlprov-py''} \citep{cwlprov-py}, for \textit{CWLProv} RO analytics to interpret the captured retrospective provenance of CWL workflow enactments. This API currently supports the following use cases.
Given a \textit{CWLProv} RO:
\begin{itemize}
\item \textbf{Workflow Runs}\newline
Each RO can contain more than one \textit{workflow run} if sub-workflows are utilized to group related tasks into one workflow. In that case, the provenance traces are stored in separate files for each workflow run. \textit{cwlprov-py} identifies the workflow enactments, including any sub-workflows, and returns the workflow identifiers annotated with the step names. The user can then select the required trace and explore particular traces in detail.
\item \textbf{Attribution} \newline
Each RO is assumed to be associated with a single enactment of the primary workflow and hence assumed to be enacted by one person. As discussed previously, \textit{CWLProv} provides additional flags to enable user provenance capture. A user can provide their name and ORCID details that can be stored as part of a RO. \textit{cwlprov-py} displays attribution details of the researcher responsible for the enactment \textit{(if enabled)} and the versions of the workflow executor utilized in the analysis.
\item \textbf{Input/Output of a Process} \newline
Provenance traces contain associations between the steps/workflows with the data they used or generated. A user interested in a particular step can identify the inputs used and outputs produced linked explicitly to that process using \textit{cwlprov-py}. This option works using individual step identifiers (level 1) as well as nested workflows (level 2), facilitating re-use of intermediate data even if the original workflow author did not explicitly expose these as workflow outputs.
\item \textbf{Partial Re-runs} \newline
Re-running or re-using only desired parts of a given workflow has been emphasized \citep{cohen2017scientific} as important to evaluate the workflow process or validate the associated published results without necessarily re-enacting the workflow as a whole. \textit{cwlprov-py} takes the identifier of the step/workflow to be re-run, parses the provenance trace to identify the inputs required and ultimately creates a JSON input object with the associated input parameters. This input object can then be used for partial re-runs of the desired step/workflow (see the sketch after this list), making segmented analysis possible even for \textit{CWLProv} consumers who do not have sufficient hardware resources for re-executing more computationally heavy steps.
\end{itemize}
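As a hedged sketch of what such a generated input object might look like (the parameter names, checksum and step identifier are hypothetical and not taken from the evaluated workflows), the snippet below writes a CWL job file whose \texttt{File} entry points back at the content-addressed payload; it could then be passed to a CWL executor together with the aggregated workflow.

\begin{verbatim}
import json

# Hypothetical inputs recovered from the provenance trace for one step:
# each File parameter is resolved to its content-addressed copy in data/.
partial_job = {
    "reference": {
        "class": "File",
        "path": "../data/b1/b1946ac92492d2347c6235b4d2611184",
    },
    "threads": 4,
}

with open("partial-job.json", "w") as f:
    json.dump(partial_job, f, indent=2)

# The resulting file can then drive a partial re-run with the aggregated
# workflow, for example (hypothetical invocation and step identifier):
#   cwltool workflow/packed.cwl#step-id partial-job.json
\end{verbatim}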
While the above explores some use cases for consuming and re-using workflow execution data, we have not explored this in full detail. Further work could develop more specific user scenarios and perform usability testing with independent domain-experts who have not seen the executed workflow before.
An important point of \textit{CWLProv} is to capture sufficient information at workflow execution time, so that post-processing (potentially by a third-party) can support unforeseen queries without requiring instrumentation at workflow design time. For instance, \texttt{cwlprov runtimes} calculates average runtime per step (requiring capture of start/stop time of each step iteration), while \texttt{cwlprov derived} calculates derivation paths back to input data (requiring consistent identifiers during execution). Further work could build a more researcher-oriented interface based on this approach, e.g. hardcoded data exploration for a particular workflow.
\subsubsection{\textcolor{black}{Temporal and Spatial Overhead with Provenance}}
\begin{table*}[t!]
\caption{Run-time comparison for the workflow enactments done cross-executor and cross-platform}\label{tab:time}
\captionsetup{width=\linewidth}
\centering
\begin{tabularx}{\linewidth}{|L | c | c | c | c | c | c |}
\toprule
\textbf{Workflow} & \multicolumn{3}{C}{\textbf{Linux}} & \multicolumn{3}{|C|}{\textbf{MacOS}} \\
\midrule
& \multicolumn{2}{|c|}{\textbf{cwltool}} & \textbf{toil-cwl-runner} & \multicolumn{2}{|c|}{\textbf{cwltool}} & \textbf{toil-cwl-runner} \\ \hline
& With Prov & W/O Prov & W/O Prov & With Prov & W/O Prov & W/O Prov \\ \hline
RNA-Seq Analysis Workflow & 4m30.289s & 4m0.139s & 3m46.817s & 3m33.306s & 3m41.166s & 3m30.406s \\ \hline
Alignment Workflow& 28m23.792s & 24m12.404s & 15m3.539s & -- & 162m35.111s & 146m27.592s \\ \hline
Somatic Variant Calling Workflow & 21m25.868s & 19m27.519s & 7m10.470s & 17m26.722s & 17m0.227s & ** \\
\hline\hline
\multicolumn{7}{l}{%
\begin{minipage}{\linewidth}%
\tiny ** This could not be tested because of a Docker mount issue on MacOS: \url{https://github.com/DataBiosphere/toil/issues/2680}%
\newline
\tiny -- This could not be tested because of the insufficient hardware resources on the MacOS test machine, hence step I of the evaluation activity could not be performed for this workflow%
\end{minipage}%
}\\
\bottomrule
\end{tabularx}
\end{table*}
Table \ref{tab:time} shows the run-times for the three workflow enactments using cwltool and toil-cwl-runner on Linux and MacOS, with and without provenance capture enabled, as described in the evaluation activity section. These workflows were enacted at least once before this time calculation, hence the timings do not include the time to download the Docker images. On a new system, re-running these workflows for the first time will download the Docker images and may take significantly longer than the times reported here, especially in the case of the Somatic Variant Calling workflow because of its image sizes.
Run-time and storage overheads are important considerations for provenance-enabled computational experiments. Factors such as the operating system, the provenance capture mechanism (operating-system-level, application-level or workflow-level), the I/O workload, the interception mechanism and the granularity of the captured information all influence these overheads \citep{Carata2014, kim2016assessing}.
In our case study, a significant time difference can be seen for the Alignment workflow, which used the most voluminous dataset and hence produced a sizable RO. This difference is due to RO generation, during which the data is aggregated within the RO. The difference between the provenance-enabled enactments and the enactments without provenance is barely noticeable for the other two workflows, which use smaller datasets. The discussion about handling big `-omics' data such as the human genome reference sequence, its index files and other database files (e.g. dbSNP) in Section \textbf{\nameref{sec:discussion}} provides a possible solution to avoid such overheads.
In addition, the noticeable time difference between the cwltool and toil-cwl-runner enactments is due to the default parallel versus serial job execution of \textit{toil-cwl-runner} and \textit{cwltool} respectively. The CWL ``scatter'' operation, when applied to one or more input parameters of a workflow step or a sub-workflow, supports parallel execution of the associated processes. Parallelism is also possible without ``scatter'' when separate processes have all their inputs ready. If sufficient compute resources are available, these jobs are enacted concurrently; otherwise they are queued for subsequent execution. Compute-intensive steps of a workflow can benefit from the scatter feature, reducing the overall run-time through parallel execution. Both the Alignment and the Somatic Variant Calling workflows utilize the scatter feature, enabling a higher degree of parallel job execution under \textit{toil-cwl-runner}, which explains the cross-executor time difference for these two workflows. The difference is negligible for the RNA-Seq workflow, which comprises serial jobs operating on comparatively small test data.
\subsubsection{\textcolor{black}Output Comparison Across Enactments}
We compared the workflow outputs after each enactment to observe the concordance and/or discordance (if any) of the results produced across platforms and across executors. As a \textit{CWLProv} RO refers to data by hashed checksums, these checksums were utilized for the comparison. It is worth mentioning that the comparison was made between the output files generated by the different enactments and a single \textit{``truth-set''} output file and its checksum available in the respective Git repositories.
The checksums of the outputs generated cross-platform and cross-executor, from both the initial enactments and the re-runs using the \textit{CWLProv} ROs, showed concordance in all but one case. The ``correctness'' as well as the agreement of these outputs across different execution environments (e.g. platform and executor) held true except for the Alignment workflow, which produced varying outputs after every execution, even with the same executor and platform. The output of the alignment algorithm used in this workflow, ``BWA mem'', was non-deterministic: it depended on the number of threads (\texttt{-t}) and the seed length (\texttt{-K}), both of which affect the output produced. While the seed length in this case was set to a constant value, the number of threads varied depending on the availability of hardware resources at run-time, thereby resulting in varying output for the same input files.
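Concretely, such a comparison reduces to comparing content digests of each produced output against the published truth-set checksum. The following minimal Python sketch illustrates this; the choice of SHA-1 and the command-line form are assumptions made for illustration, not the exact procedure used in the evaluation.
\begin{verbatim}
# Illustrative sketch: compare one workflow output against a
# published "truth-set" checksum. The SHA-1 digest and the
# command-line usage are assumptions for illustration only.
import hashlib
import sys


def sha1sum(path, chunk_size=1 << 20):
    """Stream the file in 1 MiB chunks; return hex SHA-1 digest."""
    digest = hashlib.sha1()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


if __name__ == "__main__":
    # Usage: compare_output.py <output-file> <expected-checksum>
    produced = sha1sum(sys.argv[1])
    expected = sys.argv[2].lower()
    status = "concordant" if produced == expected else "discordant"
    print(f"{status}: {produced}")
\end{verbatim}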
\section{Discussion and Future Directions} \label{sec:discussion}
This section discusses the current and future work with reference to enriched provenance capture and smart resource aggregation, and enhancements to both the \textit{CWLProv} standard and implementation.
\subsubsection{\textcolor{black}Compute and Storage Resources}
The \textit{CWLProv} format encapsulates the data and workflow definitions involved in a given workflow enactment along with its retrospective provenance trace. CWL as a standard provides constructs to declare basic hardware resource requirements such as the minimum and maximum cores, RAM and reserved file-system storage required for a particular workflow enactment. Workflow authors can provide this information in the \textit{``requirements''} or \textit{``hints''} section as a \textit{``ResourceRequirement''}. These requirements/hints can be declared at the workflow or individual step level, to help platforms/executors allocate the required resources. This information indirectly stores some aspects of the prospective view of provenance with respect to the hardware requirements of the underlying system used to enact a workflow. Currently this information is only available if declared as part of the workflow specification. In the future, we plan to include these requirements as part of the provenance of a given workflow, so that all such information is gathered in one place and users are not required to inspect multiple sources to extract it. This information can then be used as a pre-condition for the potentially successful enactment of a given workflow.
As \textit{CWLProv} is focused on retrospective provenance capture of workflow enactment, we plan to include provenance information about the compute and storage resources utilized in a given enactment to fulfill \textit{R18-resource-use}. We believe that documenting these resources will allow users to analyse their environment and resource allocations before execution, as opposed to trial-and-error methods that may result in multiple failed enactments of a given workflow. Although resource usage is an important factor, it is surprising that most existing provenance standards lack dedicated constructs to represent the underlying hardware resource usage as part of prospective or retrospective provenance. In the case of complex workflows using distributed resources, where each step could be executed on a different node/server, including all this information in a single \textit{PROV} profile would clutter the profile and render it potentially incomprehensible. Therefore, we plan to add a separate \textit{Usage Record} document in the RO conforming to GFD.204 \citep{cristofori2013usage} to describe \textit{Level 1} (and potentially \textit{Level 2}) resource usage in a common format independent of the actual execution environment.
Capturing such resource usage records requires tighter integration with the execution platform, so we consider this future work better suited to a cloud-based CWL engine like \textit{Toil} or \textit{Arvados}, as the reference implementation \textit{cwltool} does not exercise fine-grained control of its task execution. Detailed raw log files can also be provided as \textit{Level 0} provenance, as we have demonstrated with cwltool, but these will by their nature be custom per execution platform and thus should be considered unstructured. Related work already exploring this approach is \textit{cwl-metrics} \citep{tazro2018}, which analyses raw \textit{cwltool} log files in combination with detailed Docker invocation statistics using the container monitoring tool \textit{Telegraf}. An ongoing collaboration is exploring adding these metrics as additional provenance to the \textit{CWLProv} RO, with summaries in PROV and GFD.204 formats.
\subsubsection{\textcolor{black}Provenance Profile Augmented with Domain Knowledge} \label{sec:domainknowledge}
\textit{CWLProv} benefits from existing best practices proposed by numerous studies (Table \ref{tab:recommendation:wide}) and includes defined standards for workflow representation, resource aggregation and provenance tracking (Section \textbf{\nameref{sec:standards}}). We posit that the principle of following well-defined data and metadata standards enables explicit data sharing and reuse. In order to include rich metadata that enables bioinformaticians to produce specialized ROs achieving \textit{CWLProv} \textit{Level 3} as defined in Section \textbf{\nameref{sec:levels}}, we are investigating the re-use of concepts from the BioCompute Object (BCO) project \citep{Alterovitz2019}. This domain-specific information is not necessary for computation and execution, but it aids the understandability of the shared resources. We encourage workflow authors to include such metadata and external identifiers for data and underlying tools, e.g. EDAM identifiers for the resources employed in designing a given workflow. The plan is to extract these annotations and represent them in the retrospective provenance profile in \textit{CWLProv}, ultimately achieving pragmatic interoperability by providing the domain-specific scientific context of the experiments. Domain-specific information is essential in determining the nature of inputs, outputs and the context of the processes linked to a given workflow enactment \citep{Alper2018}. This information can be captured in the RO if and only if the workflow author adds it to the workflow definition; thus, achieving \textit{CWLProv} \textit{Level 3} depends on the individual workflow.
\subsubsection{\textcolor{black}Big -omics Data}
While aggregating all resources as one downloadable object improves reproducibility, the size of the resulting RO is an important factor in practice. On one hand, completeness of the resources contributes towards minimizing the \textit{workflow decay} phenomenon by reducing dependence on the availability of third-party resources. On the other hand, typical -omics data sizes can result in hard-to-manage workflow-centric ROs, leading to the spatial and temporal overheads discussed in the evaluation.
One solution is archiving the big datasets in online repositories or data stores and including the existing persistent identifiers and checksums in the RO instead of the actual data files, as previously demonstrated with BDBags \citep{chard_2016, madduri_2018}. While CWL executors like \textit{toil-cwl-runner} can be configured to deposit data in a shared repository, the \textit{cwltool} reference implementation explored in this study can only write to the local file system. External references raise the risk of unavailability of data at a later time. Therefore we recommend including the data in the RO if sufficient network and storage resources are available. Future work may explore post-processing \textit{CWLProv} ROs to replace large data files with references to stable data repositories, producing a slimmer RO for transfer where individual data items can be retrieved on demand, as well as reducing data duplication across multiple related ROs.
\subsubsection{\textcolor{black}Improving \textit{CWLProv} Efficiency with Selective Provenance Capture}
A \textit{shim} is an adaptor step that resolves a format incompatibility between two workflow tasks \citep{Mohan2014}, typically converting the previous output into an acceptable format for the next step. For example, in the \textit{RNA-seq} workflow of our case study, \textit{RNA-SeQC} requires an indexed BAM file, whereas the output of \textit{STAR} or \textit{Picard MarkDuplicates} comprises the BAM file alone. Hence, a shim step executing \textit{SAMtools index} makes the aligned reads analysis-ready for \textit{RNA-SeQC}. Compared to the more analytical steps, the provenance of such shim steps is not particularly interesting for domain scientists, and in many cases their intermediate data would effectively double the storage cost with little information gain, as such data can be reliably recreated by re-applying the predictable transformation step (considering it a \textit{pure function} without side-effects). Another type of ignorable step is the purely diagnostic step, whose outputs are used primarily during workflow design to verify tool settings. A workflow engine does not necessarily know which steps are ``boring''\footnote{The CWL 1.1 specification will add a hint \texttt{WorkReuse} for this purpose.} and our proof-of-concept implementation will dutifully store provenance from all steps.
To improve efficiency, future \textit{CWLProv} work could add options to skip capturing the outputs of specified \textit{shim} steps, or to not store files above a particular size. Similarly, a scientist or a WMS may elect to capture provenance only at a particular provenance level (see Section \textbf{\nameref{sec:levels}}).
Provenance captured under such settings would be ``incomplete'' (e.g. PROV would state that \textit{RNA-SeQC} consumed an identified BAM index file, but the corresponding bytes would not be stored in the RO). We envision that this could be indicated in the RO manifest as a variant of the \textit{CWLProv} profile identifier, giving the end-user a clear indication of what to expect in terms of completeness, so that tools like \textit{cwlprov-py} could be extended to re-create missing outputs (verifying their expected checksums) or to collapse the provenance listing of ``boring'' steps to improve human tractability.
\subsubsection{\textcolor{black} Enforcement of Best Practices -- An Open Problem}
Recommendations and best practices are proposed frequently by the scientific community to guide researchers to design their computational experiments in such a way that their research is reproducible and verifiable. Best practices are put forward not only for workflow design, but also for resource declaration, software packaging and configuration management \citep{Gruening2018}, to avoid dependence on local installations and manual dependency management. The motto \textit{``Better Software, Better Research''} \citep{Goble2014} can equally be applied to and adapted for the workflow design process.
Declarative approaches to workflow definition such as CWL facilitate and encourage users to explicitly declare everything in a workflow, improving the white-box view of the retrospective as well as the prospective provenance. Such workflows should provide insight into the complete process followed to produce a data artefact, resolving the black-boxness often associated with workflow provenance. However, it is entirely up to researchers to leverage these approaches to produce well-defined workflows with explicit details facilitating enriched capture of the provenance trace at the appropriate level, and this can require considerable effort and consistency on the workflow designer's behalf. For instance, the Alignment workflow used in this case study embeds bash scripts into the CWL tool definitions, adding another layer that must be penetrated to extract provenance information. Despite using CWL for the workflow definition and \textit{CWLProv} for provenance capture, the resulting provenance trace will be missing critical information, making it coarse-grained, and the raw logs capturing the enactment will also be less informative.
The three criteria defined by \citet{cohen2017scientific} to be followed by workflow designers are: modularized specifications, unified representation and workflow annotations. CWL facilitates a modular structure for workflow definitions by grouping related steps into \textit{subworkflows}; and, as an interoperable standard, CWL provides a common platform moving towards resolving the heterogeneity of workflow specification languages. In addition, users can add standardised domain-specific annotations to data and workflows, incorporating constructs defined by external ontologies (e.g. EDAM), to enhance understanding of the shared specification and the resources it refers to. All these features can be utilized to design better workflows and maximize information declaration, resulting in semantically rich and provenance-complete \textit{CWLProv} ROs, and should thus be expressed clearly in user guides\footnote{See for instance \url{https://view.commonwl.org/about\#format}} for workflow authors.
The usability of any \textit{CWLProv} RO directly relies on the practices followed by the researchers to design and communicate their computational analyses. Workflow-centric initiatives similar to \textit{software carpentry} \citep{softwarecarpentry} and \textit{code is science} \citep{CodeIsSc} are one possible way to organize training and create awareness of best practices. Community-driven efforts should be made to further consolidate the understanding of what is required to make a given workflow explicit and understandable. Not only is awareness of workflow design needed, but the availability of the associated resources should also be emphasized, e.g. software as containers or packages, big datasets in public repositories, and pre-processing/post-processing as part of the workflow. Without putting the proposed best practices into actual practice, complete communication, and hence the reproducibility of a workflow-centric computational analysis, is likely to remain challenging.
\section{Conclusion} \label{sec:conclusion}
The comprehensive sharing and communication of the computational experiments employed to achieve a scientific objective establishes trust in published results. Shared resources are sometimes rendered ineffective due to incomplete provenance, heterogeneity of platforms, unavailability of software and limited access to data. In this context, the contributions of this study are four-fold. First, we have provided a comprehensive summary of the recommendations put forward by the community regarding workflow design and resource sharing. Second, we define a hierarchical provenance framework to achieve homogeneity in the granularity of the information shared, with each level addressing specific provenance recommendations.
Third, we leverage the existing standards best suited to defining a standardized format, \textit{CWLProv}, for the methodical representation of workflow enactments, their provenance and the associated artefacts employed. Finally, to demonstrate the applicability of \textit{CWLProv}, we extend an existing workflow executor (\textit{cwltool}) to provide a reference implementation that generates interoperable workflow-centric ROs, aggregating and preserving data and methods to support the coherent sharing of computational analyses and experiments.
With any published scientific research, statements such as \textit{``Methods and data are available upon request''} should no longer be acceptable in a modern open-science-driven research community. Considering, on the one hand, the collaborative nature and emerging openness of bioinformatics research and, on the other hand, the heterogeneity of workflow design approaches, it is essential to provide open access to a structured representation of the data and methods utilized in any scientific study, to achieve interoperable solutions facilitating the reproducibility of science.
Provenance capture and its subsequent use to support the transparency of published research should not be treated as an afterthought but rather as a standard practice of the utmost priority. With the adoption of well-defined standards for provenance and declarative workflow definition approaches, the assumption of black-box provenance often associated with workflows can be addressed. Workflow authors should be encouraged to follow well-established and agreed-upon best practices for workflow design and software environment deployment. In conclusion, we do not require new standards, new WMSs or indeed new best practices; instead, the focus should be on implementing, utilizing and re-using existing mature community-driven initiatives to achieve consensus in representing the different aspects of computational experiments.
\section{Availability of source code and requirements}
\textit{CWLProv} is implemented as part of the CWL reference implementation \textit{cwltool}:
\begin{itemize}
\item Project name: cwltool \rrid{RRID:SCR_015528}
\item Project home page: \newline \url{https://github.com/common-workflow-language/cwltool}
\item Version: \href{https://pypi.org/project/cwltool/1.0.20181012180214/}{1.0.20181012180214} \citep{cwltool}
\item Operating system(s): Platform independent
\item Programming language: Python 3.5 or later \rrid{RRID:SCR_008394}
\item Other requirements: \href{https://www.docker.com/}{Docker} \rrid{RRID:SCR_016445} recommended
\item License: \href{http://www.apache.org/licenses/LICENSE-2.0}{Apache License, Version 2.0}
\end{itemize}
The \textit{CWLProv profile} documents the use of W3C PROV in a Research Object to capture a CWL workflow run:
\begin{itemize}
\item Project name: \textit{CWLProv} profile
\item Project home page: \url{https://w3id.org/cwl/prov}
\item Version: \href{https://w3id.org/cwl/prov/0.6.0}{0.6.0} \citep{cwlprov}
\item Operating system(s): Platform independent
\item License: \href{http://www.apache.org/licenses/LICENSE-2.0}{Apache License, Version 2.0}
\end{itemize}
The \textit{CWLProv Python Tool} can be used to explore \textit{CWLProv} ROs on the command line:
\begin{itemize}
\item Project name: \textit{CWLProv} Python Tool (\textit{cwlprov-py})
\item Project home page: \newline \url{https://github.com/common-workflow-language/cwlprov-py}
\item Version: \href{https://pypi.org/project/cwlprov/0.1.1/}{0.1.1} \citep{cwlprov-py}
\item Operating system(s): Platform independent
\item Programming language: Python 3.5 or later \rrid{RRID:SCR_008394}
\item License: \href{http://www.apache.org/licenses/LICENSE-2.0}{Apache License, Version 2.0}
\end{itemize}
\section{Availability of supporting data and materials}
\textit{CWLProv} Research Objects of CWL workflow executions are published in Mendeley Data and mirrored to Zenodo.
\begin{itemize}
\item \begin{sloppypar}
CWL run of Somatic Variant Calling Workflow (CWLProv 0.5.0 Research Object)
\citep{somatic_mendeley} \\
\url{https://doi.org/10.17632/97hj93mkfd.3} \\
\url{https://zenodo.org/record/2841641}
\end{sloppypar}
\item \begin{sloppypar}
CWL run of Alignment Workflow (CWLProv 0.6.0 Research Object)
\citep{alignment_mendeley} \\
\url{https://doi.org/10.17632/6wtpgr3kbj.1} \\
\url{https://zenodo.org/record/2632836}
\end{sloppypar}
\item \begin{sloppypar}
CWL run of RNA-seq Analysis Workflow (CWLProv 0.5.0 Research Object)
\citep{rnaseq_mendeley} \\
\url{https://doi.org/10.17632/xnwncxpw42.1} \\
\url{https://zenodo.org/record/2838898}
\end{sloppypar}
\end{itemize}
The \textit{CWLProv Python Tool} can be used to explore the above research objects.
The data and methods supporting this work are also available in the GigaScience repository, GigaDB \citep{GigaScienceData}.
\section{Declarations}
\subsection{List of abbreviations}
BAM: Binary Alignment Map; BCO: BioCompute Object; CRAM: Compressed Alignment Map; CWL: Common Workflow Language; EBI: European Bioinformatics Institute; GATK: Genome Analysis ToolKit; HPC: High Performance Computing; JSON-LD: JavaScript Object Notation for Linked Data; OS: Operating System; PROV-DM: PROVenance Data Model; RO: Research Object; W3C: World Wide Web Consortium; WMS: Workflow Management System.
\subsection{Ethical Approval}
Not applicable.
\subsection{Consent for publication}
Not applicable.
\subsection{Competing Interests}
SSR and MRC are members of the leadership team for Common Workflow Language at the Software Freedom Conservancy.
\subsection{Funding}
FZK is funded by the Melbourne International Research Scholarship (MIRS) and the Melbourne International Fee Remission Scholarship (MIFRS).
SSR and CAG are funded by the \href{https://www.bioexcel.eu}{BioExcel CoE}, a project funded by the European Commission
\href{http://dx.doi.org/10.13039/100010666}{Horizon 2020 Framework Programme} under
contracts \href{https://cordis.europa.eu/project/id/823830}{H2020-INFRAEDI-02-2018-823830} and
\href{http://cordis.europa.eu/projects/675728}{H2020-EINFRA-2015-1-675728}, as well as \href{https://www.ibisba.eu/}{IBISBA} (\href{http://cordis.europa.eu/projects/730976}{H2020-INFRAIA-1-2014-2015-730976}).
\subsection{Author's Contributions}
% CASRAI terms - see https://casrai.org/credit/
Conceptualization: FZK, SSR, MRC.
Data curation: FZK.
Formal analysis: FZK.
Funding acquisition: ROS, AL, CAG.
Investigation: FZK.
Methodology: FZK, SSR.
Project administration: FZK, SSR, ROS, AL.
Computing Resources: ROS, AL.
Software: FZK, SSR, MRC.
Supervision: MRC, ROS, AL, CAG.
Validation: FZK, SSR.
Writing - original draft: FZK.
Writing - review \& editing: FZK, SSR, ROS, AL, MRC.
\section{Acknowledgements}
An earlier version of this article \citep{cwlprov-preprint} was submitted for consideration at the International Provenance and Annotation Workshop (IPAW) 2018. We would like to thank the IPAW reviewers for their constructive comments.
We would also like to thank the GigaScience editors and reviewers Tomoya Tanjo and Alban Gaignard for constructive and valuable feedback that we think has improved the manuscript and its future directions.
We would like to thank the Common Workflow Language community, and in particular Peter Amstutz, Pau Ruiz Safont and Pjotr Prins, for their continuing support, review and feedback. We would also like to thank Brad Chapman, Christopher Ball and Lon Blauvelt for the workflows used in the evaluation and their prompt replies to our enquiries.
We are grateful for partial travel support from the Open Bioinformatics Foundation (OBF) Travel Fellowship Program \citep{OBFTravel} to Farah Zaib Khan for attending the Bioinformatics Open Source Conference (BOSC) 2017 and 2018 Codefests, which subsidized this collaborative effort.
\bibliography{paper-refs}
\end{document}