@misc{cromwell,
doi = {10.7490/f1000research.1114634.1},
author = {Voss, Kate and Van der Auwera, Geraldine and Gentry, Jeff},
title = {Full-stack genomics pipelining with GATK4 + WDL + Cromwell},
publisher = {F1000Research},
year = {2017}
}
@Article{debian-med,
author="M{\"o}ller, Steffen
and Prescott, Stuart W.
and Wirzenius, Lars
and Reinholdtsen, Petter
and Chapman, Brad
and Prins, Pjotr
and Soiland-Reyes, Stian
and Kl{\"o}tzl, Fabian
and Bagnacani, Andrea
and Kala{\v{s}}, Mat{\'u}{\v{s}}
and Tille, Andreas
and Crusoe, Michael R.",
title="Robust Cross-Platform Workflows: How Technical and Scientific Communities Collaborate to Develop, Test and Share Best Practices for Data Analysis",
journal="Data Science and Engineering",
year="2017",
month="Sep",
day="01",
volume="2",
number="3",
pages="232--244",
abstract="Information integration and workflow technologies for data analysis have always been major fields of investigation in bioinformatics. A range of popular workflow suites are available to support analyses in computational biology. Commercial providers tend to offer prepared applications remote to their clients. However, for most academic environments with local expertise, novel data collection techniques or novel data analysis, it is essential to have all the flexibility of open-source tools and open-source workflow descriptions. Workflows in data-driven science such as computational biology have considerably gained in complexity. New tools or new releases with additional features arrive at an enormous pace, and new reference data or concepts for quality control are emerging. A well-abstracted workflow and the exchange of the same across work groups have an enormous impact on the efficiency of research and the further development of the field. High-throughput sequencing adds to the avalanche of data available in the field; efficient computation and, in particular, parallel execution motivate the transition from traditional scripts and Makefiles to workflows. We here review the extant software development and distribution model with a focus on the role of integration testing and discuss the effect of common workflow language on distributions of open-source scientific software to swiftly and reliably provide the tools demanded for the execution of such formally described workflows. It is contended that, alleviated from technical differences for the execution on local machines, clusters or the cloud, communities also gain the technical means to test workflow-driven interaction across several software packages.",
issn="2364-1541",
doi="10.1007/s41019-017-0050-4"
}
@article{Gruening2018,
doi = {10.12688/f1000research.15140.1},
url = {https://doi.org/10.12688/f1000research.15140.1},
year = {2018},
month = {jun},
publisher = {F1000 Research, Ltd.},
volume = {7},
pages = {742},
author = {Bj\"{o}rn Gr\"{u}ning and Olivier Sallou and Pablo Moreno and Felipe da Veiga Leprevost and Herv{\'{e}} M{\'{e}}nager and Dan S{\o}ndergaard and Hannes R\"{o}st and Timo Sachsenberg and Brian O{\textquotesingle}Connor and F{\'{a}}bio Madeira and Victoria Dominguez Del Angel and Michael R. Crusoe and Susheel Varma and Daniel Blankenberg and Rafael C. Jimenez and Yasset Perez-Riverol},
title = {Recommendations for the packaging and containerizing of bioinformatics software},
journal = {F1000Research}
}
@article{Casati1998,
doi = {10.1016/s0169-023x(97)00033-5},
year = {1998},
month = {jan},
publisher = {Elsevier {BV}},
volume = {24},
number = {3},
pages = {211--238},
author = {F Casati and S Ceri and B Pernici and G Pozzi},
title = {Workflow evolution},
journal = {Data {\&} Knowledge Engineering}
}
@article{docker,
author = {Merkel, Dirk},
title = {Docker: Lightweight Linux Containers for Consistent Development and Deployment},
journal = {Linux Journal},
issue_date = {March 2014},
volume = {2014},
number = {239},
month = mar,
year = {2014},
issn = {1075-3583},
articleno = {2},
url = {https://www.linuxjournal.com/node/1335702},
urldate = {2018-11-29},
acmid = {2600241},
publisher = {Belltown Media},
address = {Houston, TX}
}
@misc{cwl,
doi = {10.6084/m9.figshare.3115156.v2},
author = {Amstutz, Peter and Crusoe, Michael R. and {Nebojša Tijanić} and Chapman, Brad and Chilton, John and Heuer, Michael and Kartashov, Andrey and Leehr, Dan and Ménager, Hervé and Nedeljkovich, Maya and Scales, Matt and Soiland-Reyes, Stian and Stojanovic, Luka},
keywords = {Bioinformatics, Computational Biology, 80301 Bioinformatics Software, Computer Software, 80302 Computer System Architecture, 80501 Distributed and Grid Systems, Distributed Computing},
title = {Common Workflow Language, v1.0},
publisher = {Figshare},
year = {2016}
}
@article{Alterovitz2019,
author = {Alterovitz, Gil and Dean, Dennis and Goble, Carole and Crusoe, Michael R. and Soiland-Reyes, Stian and Bell, Amanda and Hayes, Anais and Suresh, Anita and Purkayastha, Anjan and King, Charles H. and Taylor, Dan and Johanson, Elaine and Thompson, Elaine E. and Donaldson, Eric and Morizono, Hiroki and Tsang, Hsinyi and Vora, Jeet K. and Goecks, Jeremy and Yao, Jianchao and Almeida, Jonas S. and Keeney, Jonathon and Addepalli, KanakaDurga and Krampis, Konstantinos and Smith, Krista M. and Guo, Lydia and Walderhaug, Mark and Schito, Marco and Ezewudo, Matthew and Guimera, Nuria and Walsh, Paul and Kahsay, Robel and Gottipati, Srikanth and Rodwell, Timothy C. and Bloom, Toby and Lai, Yuching and Simonyan, Vahan and Mazumder, Raja},
journal = {PLOS Biology},
publisher = {Public Library of Science},
title = {Enabling precision medicine via standard communication of HTS provenance, analysis, and results},
year = {2018},
month = {12},
volume = {16},
url = {https://doi.org/10.1371/journal.pbio.3000099},
pages = {1-14},
abstract = {This Community Page article presents a communication standard for the provenance of high-throughput sequencing data; a BioCompute Object (BCO) can serve as a history of what was computed, be used as part of a validation process, or provide clarity and transparency of an experimental process to collaborators.},
number = {12},
doi = {10.1371/journal.pbio.3000099}
}
@article{Custovic799,
author = {Custovic, Adnan and Ainsworth, John and Arshad, Hasan and Bishop, Christopher and Buchan, Iain and Cullinan, Paul and Devereux, Graham and Henderson, John and Holloway, John and Roberts, Graham and Turner, Steve and Woodcock, Ashley and Simpson, Angela},
title = {The Study Team for Early Life Asthma Research (STELAR) consortium {\textquoteleft}Asthma e-lab{\textquoteright}: team science bringing data, methods and investigators together},
volume = {70},
number = {8},
pages = {799--801},
year = {2015},
doi = {10.1136/thoraxjnl-2015-206781},
publisher = {BMJ Publishing Group Ltd},
abstract = {We created Asthma e-Lab, a secure web-based research environment to support consistent recording, description and sharing of data, computational/statistical methods and emerging findings across the five UK birth cohorts. The e-Lab serves as a data repository for our unified dataset and provides the computational resources and a scientific social network to support collaborative research. All activities are transparent, and emerging findings are shared via the e-Lab, linked to explanations of analytical methods, thus enabling knowledge transfer. eLab facilitates the iterative interdisciplinary dialogue between clinicians, statisticians, computer scientists, mathematicians, geneticists and basic scientists, capturing collective thought behind the interpretations of findings.},
issn = {0040-6376},
eprint = {https://thorax.bmj.com/content/70/8/799.full.pdf},
journal = {Thorax}
}
@inproceedings{Chirigati2016,
doi = {10.1145/2882903.2899401},
year = {2016},
publisher = {{ACM} Press},
author = {Fernando Chirigati and R{\'{e}}mi Rampin and Dennis Shasha and Juliana Freire},
title = {{ReproZip}: Computational Reproducibility With Ease},
booktitle = {Proceedings of the 2016 International Conference on Management of Data - {SIGMOD} {\textquotesingle}16}
}
@article{Bergmann2014,
doi = {10.1186/s12859-014-0369-z},
year = {2014},
month = {dec},
publisher = {Springer Nature},
volume = {15},
number = {1},
pages = {369},
author = {Frank T Bergmann and Richard Adams and Stuart Moodie and Jonathan Cooper and Mihai Glont and Martin Golebiewski and Michael Hucka and Camille Laibe and Andrew K Miller and David P Nickerson and Brett G Olivier and Nicolas Rodriguez and Herbert M Sauro and Martin Scharm and Stian Soiland-Reyes and Dagmar Waltemath and Florent Yvon and Nicolas Le Nov{\`{e}}re},
title = {{COMBINE} archive and {OMEX} format: one file to share all information to reproduce a modeling project},
journal = {{BMC} Bioinformatics}
}
@article{Springate2014,
doi = {10.1371/journal.pone.0099825},
year = {2014},
month = {jun},
publisher = {Public Library of Science ({PLoS})},
volume = {9},
number = {6},
pages = {e99825},
author = {David A. Springate and Evangelos Kontopantelis and Darren M. Ashcroft and Ivan Olier and Rosa Parisi and Edmore Chamapiwa and David Reeves},
editor = {Irene Petersen},
title = {{ClinicalCodes}: An Online Clinical Codes Repository to Improve the Validity and Reproducibility of Research Using Electronic Medical Records},
journal = {{PLoS} {ONE}}
}
@incollection{Moreau2008,
doi = {10.1007/978-3-540-89965-5_31},
year = {2008},
publisher = {Springer Berlin Heidelberg},
pages = {323--326},
author = {Luc Moreau and Juliana Freire and Joe Futrelle and Robert E. McGrath and Jim Myers and Patrick Paulson},
title = {The Open Provenance Model: An Overview},
booktitle = {Lecture Notes in Computer Science}
}
@misc{moreau2009governance,
title = {Governance of the open provenance model},
author = {Moreau, Luc and Freire, Juliana and Futrelle, Joe and Myers, Jim and Paulson, Patrick},
url = {https://nms.kcl.ac.uk/luc.moreau/papers/governance.pdf},
note = {Accessed 18 Sep 2018},
year = {2009},
month = {jun},
day = {15}
}
@article{Moreau2015,
doi = {10.1016/j.websem.2015.04.001},
year = {2015},
month = {dec},
publisher = {Elsevier {BV}},
volume = {35},
pages = {235--257},
author = {Luc Moreau and Paul Groth and James Cheney and Timothy Lebo and Simon Miles},
title = {The rationale of {PROV}},
journal = {Web Semantics: Science, Services and Agents on the World Wide Web}
}
@article{Kurtzer2017,
doi = {10.1371/journal.pone.0177459},
year = {2017},
month = {may},
publisher = {Public Library of Science ({PLoS})},
volume = {12},
number = {5},
pages = {e0177459},
author = {Gregory M. Kurtzer and Vanessa Sochat and Michael W. Bauer},
editor = {Attila Gursoy},
title = {Singularity: Scientific containers for mobility of compute},
journal = {{PLOS} {ONE}}
}
@inproceedings{Tolk,
doi = {10.1109/pads.2006.39},
year = {2006},
publisher = {{IEEE}},
author = {A. Tolk},
title = {What Comes After the Semantic Web - {PADS} Implications for the Dynamic Web},
booktitle = {20th Workshop on Principles of Advanced and Distributed Simulation ({PADS}{\textquotesingle}06)}
}
@article{Grning2018,
doi = {10.1038/s41592-018-0046-7},
year = {2018},
month = {jul},
publisher = {Springer Nature},
volume = {15},
number = {7},
pages = {475--476},
author = {Bj\"{o}rn Gr\"{u}ning and and Ryan Dale and Andreas Sj\"{o}din and Brad A. Chapman and Jillian Rowe and Christopher H. Tomkins-Tinch and Renan Valieris and Johannes K\"{o}ster},
title = {Bioconda: sustainable and comprehensive software distribution for the life sciences},
journal = {Nature Methods}
}
@article{Ivie2018,
author = {Ivie, Peter and Thain, Douglas},
title = {Reproducibility in Scientific Computing},
journal = {ACM Comput. Surv.},
issue_date = {July 2018},
volume = {51},
number = {3},
month = jul,
year = {2018},
issn = {0360-0300},
pages = {63:1--63:36},
articleno = {63},
numpages = {36},
doi = {10.1145/3186266},
acmid = {3186266},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Reproducibility, computational science, replicability, reproducible, scientific computing, scientific workflow, scientific workflows, workflow, workflows},
}
@article{sandve_2013,
title = {Ten simple rules for reproducible computational research.},
author = {Sandve, Geir Kjetil and Nekrutenko, Anton and Taylor, James and Hovig, Eivind},
pages = {e1003285},
year = {2013},
month = {oct},
day = {24},
urldate = {2018-07-13},
journal = {{PLoS} Comput Biol},
volume = {9},
number = {10},
doi = {10.1371/journal.pcbi.1003285},
pmid = {24204232},
pmcid = {PMC3812051},
f1000-projects = {CWLProv}
}
@article{Spjuth2015,
doi = {10.1186/s13062-015-0071-8},
year = {2015},
month = {aug},
publisher = {Springer Nature},
volume = {10},
number = {1},
author = {Ola Spjuth and Erik Bongcam-Rudloff and Guillermo Carrasco Hern{\'{a}}ndez and Lukas Forer and Mario Giovacchini and Roman Valls Guimera and Aleksi Kallio and Eija Korpelainen and Maciej M Ka{\'{n}}du{\l}a and Milko Krachunov and David P Kreil and Ognyan Kulev and Pawe{\l} P. {\L}abaj and Samuel Lampa and Luca Pireddu and Sebastian Sch\"{o}nherr and Alexey Siretskiy and Dimitar Vassilev},
title = {Experiences with workflows for automating data-intensive bioinformatics},
journal = {Biology Direct}
}
@article{Nekrutenko2012,
doi = {10.1038/nrg3305},
year = {2012},
month = {sep},
publisher = {Springer Nature},
volume = {13},
number = {9},
pages = {667--672},
author = {Anton Nekrutenko and James Taylor},
title = {Next-generation sequencing data interpretation: enhancing reproducibility and accessibility},
journal = {Nature Reviews Genetics}
}
@article{Alper2018,
doi = {10.3390/informatics5010011},
year = {2018},
month = {feb},
publisher = {{MDPI} {AG}},
volume = {5},
number = {1},
pages = {11},
author = {Pinar Alper and Khalid Belhajjame and Vasa Curcin and Carole Goble},
title = {{LabelFlow} Framework for Annotating Workflow Provenance},
journal = {Informatics}
}
@article{Ison2013,
doi = {10.1093/bioinformatics/btt113},
year = {2013},
month = {mar},
publisher = {Oxford University Press ({OUP})},
volume = {29},
number = {10},
pages = {1325--1332},
author = {J. Ison and M. Kalas and I. Jonassen and D. Bolser and M. Uludag and H. McWilliam and J. Malone and R. Lopez and S. Pettifer and P. Rice},
title = {{EDAM}: an ontology of bioinformatics operations, types of data and identifiers, topics and formats},
journal = {Bioinformatics}
}
@article{ludascher2016brief,
title = {A Brief Tour Through Provenance in Scientific Workflows and Databases},
author = {Ludäscher, Bertram},
pages = {103--126},
year = {2016},
journal = {Springer Proceedings in Business and Economics},
publisher = {Springer International Publishing},
isbn = {9783319402260},
issn = {2198-7254},
doi = {10.1007/978-3-319-40226-0_7}
}
@article{Littauer2012,
doi = {10.2218/ijdc.v7i2.232},
year = {2012},
month = {oct},
publisher = {Edinburgh University Library},
volume = {7},
number = {2},
pages = {92--100},
author = {Richard Littauer and Karthik Ram and Bertram Lud\"{a}scher and William Michener and Rebecca Koskela},
title = {Trends in Use of Scientific Workflows: Insights from a Public Repository and Recommendations for Best Practice},
journal = {International Journal of Digital Curation}
}
@article{Gymrek2016,
doi = {10.1186/s13742-016-0127-4},
year = {2016},
month = {may},
publisher = {Oxford University Press ({OUP})},
volume = {5},
number = {1},
author = {Melissa Gymrek and Yossi Farjoun},
title = {Recommendations for open data science},
journal = {{GigaScience}}
}
@article{Stodden2014,
doi = {10.5334/jors.ay},
year = {2014},
month = {jul},
publisher = {Ubiquity Press, Ltd.},
volume = {2},
number = {1},
author = {Victoria Stodden and Sheila Miguez},
title = {Best Practices for Computational Science: Software Infrastructure and Environments for Reproducible and Extensible Research},
journal = {Journal of Open Research Software}
}
@inproceedings{Zhao2012,
doi = {10.1109/escience.2012.6404482},
year = {2012},
month = {oct},
publisher = {{IEEE}},
author = {Jun Zhao and Jose Manuel Gomez-Perez and Khalid Belhajjame and Graham Klyne and Esteban Garcia-Cuesta and Aleix Garrido and Kristina Hettne and Marco Roos and David De Roure and Carole Goble},
title = {Why workflows break {\textendash} Understanding and combating decay in Taverna workflows},
booktitle = {2012 {IEEE} 8th International Conference on E-Science}
}
@article{garijo_2013,
title = {Quantifying reproducibility in computational biology: the case of the tuberculosis drugome.},
author = {Garijo, Daniel and Kinnings, Sarah and Xie, Li and Xie, Lei and Zhang, Yinliang and Bourne, Philip E and Gil, Yolanda},
pages = {e80278},
year = {2013},
month = {nov},
day = {27},
urldate = {2016-04-13},
journal = {{PLoS} {ONE}},
volume = {8},
number = {11},
doi = {10.1371/journal.pone.0080278},
pmid = {24312207},
pmcid = {PMC3842296}
}
@article{stephens_2015,
title = {Big data: astronomical or genomical?},
author = {Stephens, Zachary D and Lee, Skylar Y and Faghri, Faraz and Campbell, Roy H and Zhai, Chengxiang and Efron, Miles J and Iyer, Ravishankar and Schatz, Michael C and Sinha, Saurabh and Robinson, Gene E},
pages = {e1002195},
year = {2015},
month = {jul},
day = {7},
urldate = {2018-07-23},
journal = {{PLoS} Biol},
volume = {13},
number = {7},
issn = {1545-7885},
doi = {10.1371/journal.pbio.1002195},
pmid = {26151137},
pmcid = {PMC4494865},
f1000-projects = {CWLProv},
abstract = {Genomics is a Big Data science and is going to get much bigger, very soon, but it is not known whether the needs of genomics will exceed other Big Data domains. Projecting to the year 2025, we compared genomics with three other major generators of Big Data: astronomy, {YouTube}, and Twitter. Our estimates show that genomics is a "four-headed beast"--it is either on par with or the most demanding of the domains analyzed here in terms of data acquisition, storage, distribution, and analysis. We discuss aspects of new technologies that will need to be developed to rise up and meet the computational challenges that genomics poses for the near future. Now is the time for concerted, community-wide planning for the "genomical" challenges of the next decade.}
}
@article{gonzlezbeltrn_2015,
title = {From Peer-Reviewed to Peer-Reproduced in Scholarly Publishing: The Complementary Roles of Data Models and Workflows in Bioinformatics.},
author = {González-Beltrán, Alejandra and Li, Peter and Zhao, Jun and Avila-Garcia, Maria Susana and Roos, Marco and Thompson, Mark and van der Horst, Eelke and Kaliyaperumal, Rajaram and Luo, Ruibang and Lee, Tin-Lap and Lam, Tak-Wah and Edmunds, Scott C and Sansone, Susanna-Assunta and Rocca-Serra, Philippe},
pages = {e0127612},
url = {http://dx.plos.org/10.1371/journal.pone.0127612},
year = {2015},
month = {jul},
day = {8},
urldate = {2018-07-23},
journal = {{PLoS} {ONE}},
volume = {10},
number = {7},
issn = {1932-6203},
doi = {10.1371/journal.pone.0127612},
pmid = {26154165},
pmcid = {PMC4495984},
f1000-projects = {CWLProv},
abstract = {{MOTIVATION}: Reproducing the results from a scientific paper can be challenging due to the absence of data and the computational tools required for their analysis. In addition, details relating to the procedures used to obtain the published results can be difficult to discern due to the use of natural language when reporting how experiments have been performed. The Investigation/Study/Assay ({ISA}), Nanopublications ({NP}), and Research Objects ({RO}) models are conceptual data modelling frameworks that can structure such information from scientific papers. Computational workflow platforms can also be used to reproduce analyses of data in a principled manner. We assessed the extent by which {ISA}, {NP}, and {RO} models, together with the Galaxy workflow system, can capture the experimental processes and reproduce the findings of a previously published paper reporting on the development of {SOAPdenovo2}, a de novo genome assembler. {RESULTS}: Executable workflows were developed using Galaxy, which reproduced results that were consistent with the published findings. A structured representation of the information in the {SOAPdenovo2} paper was produced by combining the use of {ISA}, {NP}, and {RO} models. By structuring the information in the published paper using these data and scientific workflow modelling frameworks, it was possible to explicitly declare elements of experimental design, variables, and findings. The models served as guides in the curation of scientific information and this led to the identification of inconsistencies in the original published paper, thereby allowing its authors to publish corrections in the form of an errata. {AVAILABILITY}: {SOAPdenovo2} scripts, data, and results are available through the {GigaScience} Database: http://dx.doi.org/10.5524/100044; the workflows are available from {GigaGalaxy}: http://galaxy.cbiit.cuhk.edu.hk; and the representations using the {ISA}, {NP}, and {RO} models are available through the {SOAPdenovo2} case study website http://isa-tools.github.io/soapdenovo2/. {CONTACT}: philippe.rocca-serra@oerc.ox.ac.uk and susanna-assunta.sansone@oerc.ox.ac.uk.}
}
@article{ciccarese_2013,
title = {{PAV} ontology: provenance, authoring and versioning.},
author = {Ciccarese, Paolo and Soiland-Reyes, Stian and Belhajjame, Khalid and Gray, Alasdair Jg and Goble, Carole and Clark, Tim},
pages = {37},
url = {http://dx.doi.org/10.1186/2041-1480-4-37},
year = {2013},
month = {nov},
day = {22},
urldate = {2017-08-16},
journal = {J Biomed Semantics},
volume = {4},
number = {1},
doi = {10.1186/2041-1480-4-37},
pmid = {24267948},
pmcid = {PMC4177195},
f1000-projects = {{CWLProv} and Your publications},
abstract = {{BACKGROUND}: Provenance is a critical ingredient for establishing trust of published scientific content. This is true whether we are considering a data set, a computational workflow, a peer-reviewed publication or a simple scientific claim with supportive evidence. Existing vocabularies such as Dublin Core Terms ({DC} Terms) and the {W3C} Provenance Ontology ({PROV}-O) are domain-independent and general-purpose and they allow and encourage for extensions to cover more specific needs. In particular, to track authoring and versioning information of web resources, {PROV}-O provides a basic methodology but not any specific classes and properties for identifying or distinguishing between the various roles assumed by agents manipulating digital artifacts, such as author, contributor and curator. {RESULTS}: We present the Provenance, Authoring and Versioning ontology ({PAV}, namespace http://purl.org/pav/): a lightweight ontology for capturing "just enough" descriptions essential for tracking the provenance, authoring and versioning of web resources. We argue that such descriptions are essential for digital scientific content. {PAV} distinguishes between contributors, authors and curators of content and creators of representations in addition to the provenance of originating resources that have been accessed, transformed and consumed. We explore five projects (and communities) that have adopted {PAV} illustrating their usage through concrete examples. Moreover, we present mappings that show how {PAV} extends the {W3C} {PROV}-O ontology to support broader interoperability. {METHOD}: The initial design of the {PAV} ontology was driven by requirements from the {AlzSWAN} project with further requirements incorporated later from other projects detailed in this paper. The authors strived to keep {PAV} lightweight and compact by including only those terms that have demonstrated to be pragmatically useful in existing applications, and by recommending terms from existing ontologies when plausible. {DISCUSSION}: We analyze and compare {PAV} with related approaches, namely Provenance Vocabulary ({PRV}), {DC} Terms and {BIBFRAME}. We identify similarities and analyze differences between those vocabularies and {PAV}, outlining strengths and weaknesses of our proposed model. We specify {SKOS} mappings that align {PAV} with {DC} Terms. We conclude the paper with general remarks on the applicability of {PAV}.}
}
@article{wolstencroft_2013,
title = {The Taverna workflow suite: designing and executing workflows of Web Services on the desktop, web or in the cloud.},
author = {Wolstencroft, Katherine and Haines, Robert and Fellows, Donal and Williams, Alan and Withers, David and Owen, Stuart and Soiland-Reyes, Stian and Dunlop, Ian and Nenadic, Aleksandra and Fisher, Paul and Bhagat, Jiten and Belhajjame, Khalid and Bacall, Finn and Hardisty, Alex and Nieva de la Hidalga, Abraham and Balcazar Vargas, Maria P and Sufi, Shoaib and Goble, Carole},
pages = {W557-61},
year = {2013},
month = {jul},
journal = {Nucleic Acids Res},
volume = {41},
number = {Web Server issue},
doi = {10.1093/nar/gkt328},
pmid = {23640334},
pmcid = {PMC3692062},
f1000-projects = {{CWL} and {CWLProv} and Debianpaper and Your publications},
abstract = {The Taverna workflow tool suite (http://www.taverna.org.uk) is designed to combine distributed Web Services and/or local tools into complex analysis pipelines. These pipelines can be executed on local desktop machines or through larger infrastructure (such as supercomputers, Grids or cloud environments), using the Taverna Server. In bioinformatics, Taverna workflows are typically used in the areas of high-throughput omics analyses (for example, proteomics or transcriptomics), or for evidence gathering methods involving text mining or data mining. Through Taverna, scientists have access to several thousand different tools and resources that are freely available from a large range of life science institutions. Once constructed, the workflows are reusable, executable bioinformatics protocols that can be shared, reused and repurposed. A repository of public workflows is available at http://www.myexperiment.org. This article provides an update to the Taverna tool suite, highlighting new features and developments in the workbench and the Taverna Server.}
}
@article{peng_2011,
title = {Reproducible research in computational science.},
author = {Peng, Roger D},
pages = {1226-1227},
year = {2011},
month = {dec},
day = {2},
journal = {Science},
volume = {334},
number = {6060},
doi = {10.1126/science.1213847},
pmid = {22144613},
pmcid = {PMC3383002},
f1000-projects = {CWLProv},
abstract = {Computational science has led to exciting new developments, but the nature of the work has exposed limitations in our ability to evaluate published findings. Reproducibility has the potential to serve as a minimum standard for judging scientific claims when full independent replication of a study is not possible.}
}
@article{ison_2013,
title = {{EDAM}: an ontology of bioinformatics operations, types of data and identifiers, topics and formats.},
author = {Ison, Jon and Kalas, Matús and Jonassen, Inge and Bolser, Dan and Uludag, Mahmut and {McWilliam}, Hamish and Malone, James and Lopez, Rodrigo and Pettifer, Steve and Rice, Peter},
pages = {1325-1332},
year = {2013},
month = {may},
day = {15},
journal = {Bioinformatics},
volume = {29},
number = {10},
doi = {10.1093/bioinformatics/btt113},
pmid = {23479348},
pmcid = {PMC3654706},
f1000-projects = {{CWLProv} and Debianpaper},
abstract = {{MOTIVATION}: Advancing the search, publication and integration of bioinformatics tools and resources demands consistent machine-understandable descriptions. A comprehensive ontology allowing such descriptions is therefore required. {RESULTS}: {EDAM} is an ontology of bioinformatics operations (tool or workflow functions), types of data and identifiers, application domains and data formats. {EDAM} supports semantic annotation of diverse entities such as Web services, databases, programmatic libraries, standalone tools, interactive applications, data schemas, datasets and publications within bioinformatics. {EDAM} applies to organizing and finding suitable tools and data and to automating their integration into complex applications or workflows. It includes over 2200 defined concepts and has successfully been used for annotations and implementations. {AVAILABILITY}: The latest stable version of {EDAM} is available in {OWL} format from http://edamontology.org/{EDAM}.owl and in {OBO} format from http://edamontology.org/{EDAM}.obo. It can be viewed online at the {NCBO} {BioPortal} and the {EBI} Ontology Lookup Service. For documentation and license please refer to http://edamontology.org. This article describes version 1.2 available at http://edamontology.org/{EDAM\_1}.2.owl. {CONTACT}: jison@ebi.ac.uk.}
}
@article{muse_1994,
title = {A likelihood approach for comparing synonymous and nonsynonymous nucleotide substitution rates, with application to the chloroplast genome.},
author = {Muse, S V and Gaut, B S},
pages = {715-724},
year = {1994},
month = {sep},
journal = {Mol Biol Evol},
volume = {11},
number = {5},
doi = {10.1093/oxfordjournals.molbev.a040152},
pmid = {7968485},
f1000-projects = {CWLProv},
abstract = {A model of {DNA} sequence evolution applicable to coding regions is presented. This represents the first evolutionary model that accounts for dependencies among nucleotides within a codon. The model uses the codon, as opposed to the nucleotide, as the unit of evolution, and is parameterized in terms of synonymous and nonsynonymous nucleotide substitution rates. One of the model's advantages over those used in methods for estimating synonymous and nonsynonymous substitution rates is that it completely corrects for multiple hits at a codon, rather than taking a parsimony approach and considering only pathways of minimum change between homologous codons. Likelihood-ratio versions of the relative-rate test are constructed and applied to data from the complete chloroplast {DNA} sequences of Oryza sativa, Nicotiana tabacum, and Marchantia polymorpha. Results of these tests confirm previous findings that substitution rates in the chloroplast genome are subject to both lineage-specific and locus-specific effects. Additionally, the new tests suggest that the rate heterogeneity is due primarily to differences in nonsynonymous substitution rates. Simulations help confirm previous suggestions that silent sites are saturated, leaving no evidence of heterogeneity in synonymous substitution rates.}
}
@article{wilkinson_2016,
title = {The {FAIR} Guiding Principles for scientific data management and stewardship.},
author = {Wilkinson, Mark D and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E and Bouwman, Jildau and Brookes, Anthony J and Clark, Tim and Crosas, Mercè and Dillo, Ingrid and Dumon, Olivier and Edmunds, Scott and Evelo, Chris T and Finkers, Richard and Gonzalez-Beltran, Alejandra and Gray, Alasdair J G and Groth, Paul and Goble, Carole and Grethe, Jeffrey S and Heringa, Jaap and 't Hoen, Peter A C and Hooft, Rob and Kuhn, Tobias and Kok, Ruben and Kok, Joost and Lusher, Scott J and Martone, Maryann E and Mons, Albert and Packer, Abel L and Persson, Bengt and Rocca-Serra, Philippe and Roos, Marco and van Schaik, Rene and Sansone, Susanna-Assunta and Schultes, Erik and Sengstag, Thierry and Slater, Ted and Strawn, George and Swertz, Morris A and Thompson, Mark and van der Lei, Johan and van Mulligen, Erik and Velterop, Jan and Waagmeester, Andra and Wittenburg, Peter and Wolstencroft, Katherine and Zhao, Jun and Mons, Barend},
pages = {160018},
year = {2016},
month = {mar},
day = {15},
urldate = {2018-07-13},
journal = {Sci Data},
volume = {3},
issn = {2052-4463},
doi = {10.1038/sdata.2016.18},
pmid = {26978244},
pmcid = {PMC4792175},
f1000-projects = {CWLProv},
abstract = {There is an urgent need to improve the infrastructure supporting the reuse of scholarly data. A diverse set of stakeholders-representing academia, industry, funding agencies, and scholarly publishers-have come together to design and jointly endorse a concise and measureable set of principles that we refer to as the {FAIR} Data Principles. The intent is that these may act as a guideline for those wishing to enhance the reusability of their data holdings. Distinct from peer initiatives that focus on the human scholar, the {FAIR} Principles put specific emphasis on enhancing the ability of machines to automatically find and use the data, in addition to supporting its reuse by individuals. This Comment is the first formal publication of the {FAIR} Principles, and includes the rationale behind them, and some exemplar implementations in the community.}
}
@article{freire_2012,
title = {Making Computations and Publications Reproducible with {VisTrails}},
author = {Freire, Juliana and Silva, Claudio T.},
pages = {18-25},
year = {2012},
month = {jul},
journal = {Comput Sci Eng},
volume = {14},
number = {4},
issn = {1521-9615},
doi = {10.1109/MCSE.2012.76},
f1000-projects = {CWLProv}
}
@article{afgan_2016,
title = {The Galaxy platform for accessible, reproducible and collaborative biomedical analyses: 2016 update.},
author = {Afgan, Enis and Baker, Dannon and van den Beek, Marius and Blankenberg, Daniel and Bouvier, Dave and Čech, Martin and Chilton, John and Clements, Dave and Coraor, Nate and Eberhard, Carl and Grüning, Björn and Guerler, Aysam and Hillman-Jackson, Jennifer and Von Kuster, Greg and Rasche, Eric and Soranzo, Nicola and Turaga, Nitesh and Taylor, James and Nekrutenko, Anton and Goecks, Jeremy},
pages = {W3-W10},
year = {2016},
month = {jul},
day = {8},
journal = {Nucleic Acids Res},
volume = {44},
number = {W1},
doi = {10.1093/nar/gkw343},
pmid = {27137889},
pmcid = {PMC4987906},
f1000-projects = {{CWL} and {CWLProv} and Debianpaper},
abstract = {High-throughput data production technologies, particularly 'next-generation' {DNA} sequencing, have ushered in widespread and disruptive changes to biomedical research. Making sense of the large datasets produced by these technologies requires sophisticated statistical and computational methods, as well as substantial computational power. This has led to an acute crisis in life sciences, as researchers without informatics training attempt to perform computation-dependent analyses. Since 2005, the Galaxy project has worked to address this problem by providing a framework that makes advanced computational tools usable by non experts. Galaxy seeks to make data-intensive research more accessible, transparent and reproducible by providing a Web-based environment in which users can perform computational analyses and have all of the details automatically tracked for later inspection, publication, or reuse. In this report we highlight recently added features enabling biomedical analyses on a large scale. \copyright The Author(s) 2016. Published by Oxford University Press on behalf of Nucleic Acids Research.}
}
@article{leipzig_2017,
title = {A review of bioinformatic pipeline frameworks.},
author = {Leipzig, Jeremy},
pages = {530-536},
year = {2017},
month = {may},
day = {1},
journal = {Brief Bioinformatics},
volume = {18},
number = {3},
doi = {10.1093/bib/bbw020},
pmid = {27013646},
pmcid = {PMC5429012},
f1000-projects = {{CWL} and {CWLProv} and Debianpaper},
abstract = {High-throughput bioinformatic analyses increasingly rely on pipeline frameworks to process sequence and metadata. Modern implementations of these frameworks differ on three key dimensions: using an implicit or explicit syntax, using a configuration, convention or class-based design paradigm and offering a command line or workbench interface. Here I survey and compare the design philosophies of several current pipeline frameworks. I provide practical recommendations based on analysis requirements and the user base. \copyright The Author 2016. Published by Oxford University Press.}
}
@article{smith_2016,
title = {Software citation principles},
author = {Smith, Arfon M. and Katz, Daniel S. and Niemeyer, Kyle E. and {FORCE11} Software Citation Working Group},
pages = {e86},
url = {https://peerj.com/articles/cs-86},
year = {2016},
month = {sep},
day = {19},
urldate = {2017-09-26},
journal = {{PeerJ} Computer Science},
volume = {2},
issn = {2376-5992},
doi = {10.7717/peerj-cs.86},
f1000-projects = {{CWLProv} and Reproducibility}
}
@article{bergmann_2014,
title = {{COMBINE} archive and {OMEX} format: one file to share all information to reproduce a modeling project.},
author = {Bergmann, Frank T and Adams, Richard and Moodie, Stuart and Cooper, Jonathan and Glont, Mihai and Golebiewski, Martin and Hucka, Michael and Laibe, Camille and Miller, Andrew K and Nickerson, David P and Olivier, Brett G and Rodriguez, Nicolas and Sauro, Herbert M and Scharm, Martin and Soiland-Reyes, Stian and Waltemath, Dagmar and Yvon, Florent and Le Novère, Nicolas},
pages = {369},
url = {http://dx.doi.org/10.1186/s12859-014-0369-z},
year = {2014},
month = {dec},
day = {14},
urldate = {2017-08-16},
journal = {{BMC} Bioinformatics},
volume = {15},
number = {1},
doi = {10.1186/s12859-014-0369-z},
pmid = {25494900},
pmcid = {PMC4272562},
f1000-projects = {{CWLProv} and Your publications},
abstract = {{BACKGROUND}: With the ever increasing use of computational models in the biosciences, the need to share models and reproduce the results of published studies efficiently and easily is becoming more important. To this end, various standards have been proposed that can be used to describe models, simulations, data or other essential information in a consistent fashion. These constitute various separate components required to reproduce a given published scientific result. {RESULTS}: We describe the Open Modeling {EXchange} format ({OMEX}). Together with the use of other standard formats from the Computational Modeling in Biology Network ({COMBINE}), {OMEX} is the basis of the {COMBINE} Archive, a single file that supports the exchange of all the information necessary for a modeling and simulation experiment in biology. An {OMEX} file is a {ZIP} container that includes a manifest file, listing the content of the archive, an optional metadata file adding information about the archive and its content, and the files describing the model. The content of a {COMBINE} Archive consists of files encoded in {COMBINE} standards whenever possible, but may include additional files defined by an Internet Media Type. Several tools that support the {COMBINE} Archive are available, either as independent libraries or embedded in modeling software. {CONCLUSIONS}: The {COMBINE} Archive facilitates the reproduction of modeling and simulation experiments in biology by embedding all the relevant information in one file. Having all the information stored and exchanged at once also helps in building activity logs and audit trails. We anticipate that the {COMBINE} Archive will become a significant help for modellers, as the domain moves to larger, more complex experiments such as multi-scale models of organs, digital organisms, and bioengineering.}
}
@inproceedings{garijo_2011,
title = {A new approach for publishing workflows: Abstractions, standards, and linked data},
author = {Garijo, Daniel and Gil, Yolanda},
pages = {47},
publisher = {{ACM} Press},
url = {http://dl.acm.org/citation.cfm?doid=2110497.2110504},
year = {2011},
month = {nov},
day = {14},
urldate = {2018-07-13},
isbn = {9781450311007},
doi = {10.1145/2110497.2110504},
address = {New York, New York, {USA}},
f1000-projects = {CWLProv},
booktitle = {Proceedings of the 6th workshop on Workflows in support of large-scale science - {WORKS} '11}
}
@article{kaushik_2017,
title = {Rabix: an open-source workflow executor supporting recomputability and interoperability of workflow descriptions.},
author = {Kaushik, Gaurav and Ivkovic, Sinisa and Simonovic, Janko and Tijanic, Nebojsa and Davis-Dusenbery, Brandi and Kural, Deniz},
pages = {154-165},
year = {2017},
urldate = {2017-11-03},
journal = {Pac Symp Biocomput},
volume = {22},
doi = {10.1142/9789813207813_0016},
pmid = {27896971},
pmcid = {PMC5166558},
f1000-projects = {{CWL} and {CWLProv}},
abstract = {As biomedical data has become increasingly easy to generate in large quantities, the methods used to analyze it have proliferated rapidly. Reproducible and reusable methods are required to learn from large volumes of data reliably. To address this issue, numerous groups have developed workflow specifications or execution engines, which provide a framework with which to perform a sequence of analyses. One such specification is the Common Workflow Language, an emerging standard which provides a robust and flexible framework for describing data analysis tools and workflows. In addition, reproducibility can be furthered by executors or workflow engines which interpret the specification and enable additional features, such as error logging, file organization, optimizations to computation and job scheduling, and allow for easy computing on large volumes of data. To this end, we have developed the Rabix Executor, an open-source workflow engine for the purposes of improving reproducibility through reusability and interoperability of workflow descriptions.}
}
@article{stodden_2016,
title = {Enhancing reproducibility for computational methods.},
author = {Stodden, Victoria and {McNutt}, Marcia and Bailey, David H and Deelman, Ewa and Gil, Yolanda and Hanson, Brooks and Heroux, Michael A and Ioannidis, John P A and Taufer, Michela},
pages = {1240-1241},
year = {2016},
month = {dec},
day = {9},
urldate = {2018-07-23},
journal = {Science},
volume = {354},
number = {6317},
issn = {0036-8075},
doi = {10.1126/science.aah6168},
pmid = {27940837},
f1000-projects = {CWLProv}
}
@article{oconnor_2017,
title = {The Dockstore: enabling modular, community-focused sharing of Docker-based genomics tools and workflows.},
author = {O'Connor, Brian D and Yuen, Denis and Chung, Vincent and Duncan, Andrew G and Liu, Xiang Kun and Patricia, Janice and Paten, Benedict and Stein, Lincoln and Ferretti, Vincent},
pages = {52},
year = {2017},
month = {jan},
day = {18},
urldate = {2018-07-13},
journal = {F1000Res},
volume = {6},
doi = {10.12688/f1000research.10137.1},
pmid = {28344774},
pmcid = {PMC5333608},
f1000-projects = {{CWLProv} and Debianpaper},
abstract = {As genomic datasets continue to grow, the feasibility of downloading data to a local organization and running analysis on a traditional compute environment is becoming increasingly problematic. Current large-scale projects, such as the {ICGC} {PanCancer} Analysis of Whole Genomes ({PCAWG}), the Data Platform for the U.S. Precision Medicine Initiative, and the {NIH} Big Data to Knowledge Center for Translational Genomics, are using cloud-based infrastructure to both host and perform analysis across large data sets. In {PCAWG}, over 5,800 whole human genomes were aligned and variant called across 14 cloud and {HPC} environments; the processed data was then made available on the cloud for further analysis and sharing. If run locally, an operation at this scale would have monopolized a typical academic data centre for many months, and would have presented major challenges for data storage and distribution. However, this scale is increasingly typical for genomics projects and necessitates a rethink of how analytical tools are packaged and moved to the data. For {PCAWG}, we embraced the use of highly portable Docker images for encapsulating and sharing complex alignment and variant calling workflows across highly variable environments. While successful, this endeavor revealed a limitation in Docker containers, namely the lack of a standardized way to describe and execute the tools encapsulated inside the container. As a result, we created the Dockstore ( https://dockstore.org), a project that brings together Docker images with standardized, machine-readable ways of describing and running the tools contained within. This service greatly improves the sharing and reuse of genomics tools and promotes interoperability with similar projects through emerging web service standards developed by the Global Alliance for Genomics and Health ({GA4GH}).}
}
@article{ditommaso_2017,
title = {Nextflow enables reproducible computational workflows.},
author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W and Barja, Pablo Prieto and Palumbo, Emilio and Notredame, Cedric},
pages = {316-319},
year = {2017},
month = {apr},
day = {11},
urldate = {2018-07-13},
journal = {Nat Biotechnol},
volume = {35},
number = {4},
doi = {10.1038/nbt.3820},
pmid = {28398311},
f1000-projects = {{CWL} and {CWLProv} and Debianpaper}
}
@online{nextflow_tracing,
title = {Nextflow: Tracing and visualization},
author = {{Centre for Genomic Regulation (CRG)}},
year = 2018,
url = {https://www.nextflow.io/docs/latest/tracing.html#trace-report},
urldate = {2018-11-28},
note = {Accessed 28 Nov 2018}
}
@misc{NogalesPoster2018,
author = "Garriga Nogales, Edgar and Di Tommaso, Paolo and Notredame, Cedric",
title = "{{Nextflow integration for the Research Object Specification}}",
publisher = "Figshare",
year = "2018",
month = "10",
note = { Poster at Workshop on Research Objects (RO2018)},
doi = "10.5281/zenodo.1472384"
}
@misc{nogales_abstract_2018,
title = {{Nextflow integration for the Research Object Specification}},
year = {2018},
publisher = {Zenodo},
month = {oct},
note = {{Poster. At Workshop on Research Objects (RO 2018), 29 Oct 2018, Amsterdam, Netherlands.}},
doi = {10.5281/zenodo.1472385}
}
@article{guimera_2012,
title = {bcbio-nextgen: Automated, distributed next-gen sequencing pipeline},
author = {Guimera, Roman Valls},
pages = {30},
year = {2012},
month = {feb},
day = {28},
urldate = {2017-09-07},
journal = {{EMBnet} j.},
volume = {17},
number = {B},
issn = {2226-6089},
doi = {10.14806/ej.17.B.286},
f1000-projects = {{CWL} and {CWLProv} and Debianpaper}
}
@article{hettne_2014,
title = {Structuring research methods and data with the research object model: genomics workflows as a case study.},
author = {Hettne, Kristina M and Dharuri, Harish and Zhao, Jun and Wolstencroft, Katherine and Belhajjame, Khalid and Soiland-Reyes, Stian and Mina, Eleni and Thompson, Mark and Cruickshank, Don and Verdes-Montenegro, Lourdes and Garrido, Julian and de Roure, David and Corcho, Oscar and Klyne, Graham and van Schouwen, Reinout and 't Hoen, Peter A C and Bechhofer, Sean and Goble, Carole and Roos, Marco},
pages = {41},
year = {2014},
month = {sep},
day = {18},
urldate = {2018-07-13},
journal = {J Biomed Semantics},
volume = {5},
number = {1},
doi = {10.1186/2041-1480-5-41},
pmid = {25276335},
pmcid = {PMC4177597},
f1000-projects = {{CWLProv} and Your publications},
abstract = {{BACKGROUND}: One of the main challenges for biomedical research lies in the computer-assisted integrative study of large and increasingly complex combinations of data in order to understand molecular mechanisms. The preservation of the materials and methods of such computational experiments with clear annotations is essential for understanding an experiment, and this is increasingly recognized in the bioinformatics community. Our assumption is that offering means of digital, structured aggregation and annotation of the objects of an experiment will provide necessary meta-data for a scientist to understand and recreate the results of an experiment. To support this we explored a model for the semantic description of a workflow-centric Research Object ({RO}), where an {RO} is defined as a resource that aggregates other resources, e.g., datasets, software, spreadsheets, text, etc. We applied this model to a case study where we analysed human metabolite variation by workflows. {RESULTS}: We present the application of the workflow-centric {RO} model for our bioinformatics case study. Three workflows were produced following recently defined Best Practices for workflow design. By modelling the experiment as an {RO}, we were able to automatically query the experiment and answer questions such as "which particular data was input to a particular workflow to test a particular hypothesis?", and "which particular conclusions were drawn from a particular workflow?". {CONCLUSIONS}: Applying a workflow-centric {RO} model to aggregate and annotate the resources used in a bioinformatics experiment, allowed us to retrieve the conclusions of the experiment in the context of the driving hypothesis, the executed workflows and their input data. The {RO} model is an extendable reference model that can be used by other systems as well. {AVAILABILITY}: The Research Object is available at http://www.myexperiment.org/packs/428 The {Wf4Ever} Research Object Model is available at https://w3id.org/ro/2016-01-28/.}
}
@article{kurtzer_2017,
title = {Singularity: Scientific containers for mobility of compute.},
author = {Kurtzer, Gregory M and Sochat, Vanessa and Bauer, Michael W},
pages = {e0177459},
year = {2017},
month = {may},
day = {11},
urldate = {2017-11-03},
journal = {{PLoS} {ONE}},
volume = {12},
number = {5},
doi = {10.1371/journal.pone.0177459},
pmid = {28494014},
pmcid = {PMC5426675},
f1000-projects = {{CWLProv} and Debianpaper},
abstract = {Here we present Singularity, software developed to bring containers and reproducibility to scientific computing. Using Singularity containers, developers can work in reproducible environments of their choosing and design, and these complete environments can easily be copied and executed on other platforms. Singularity is an open source initiative that harnesses the expertise of system and software engineers and researchers alike, and integrates seamlessly into common workflows for both of these groups. As its primary use case, Singularity brings mobility of computing to both users and {HPC} centers, providing a secure means to capture and distribute software and compute environments. This ability to create and deploy reproducible environments across these centers, a previously unmet need, makes Singularity a game changing development for computational science.}
}
@inproceedings{robinson_2017,
title = {Common Workflow Language Viewer},
author = {Robinson, Mark and Soiland-Reyes, Stian and Crusoe, Michael R. and Goble, Carole},
url = {https://view.commonwl.org/},
year = {2017},
month = {jul},
day = {22},
urldate = {2017-08-16},
f1000-projects = {{CWLProv} and Your publications},
abstract = {The Common Workflow Language ({CWL}) project emerged from the {BOSC} 2014 Codefest as a grassroots, multi-vendor working group to tackle the portability of data analysis workflows. Its specification for describing workflows and command line tools aims to make them portable and scalable across a variety of computing platforms. At its heart {CWL} is a set of structured text files ({YAML}) with various extensibility points to the format. However, the {CWL} syntax and multi-file collections are not conducive to workflow browsing, exchange and understanding: for this we need a visualization suite. {CWL} Viewer is a richly featured {CWL} visualization suite that graphically presents and lists the details of {CWL} workflows with their inputs, outputs and steps. It also packages the {CWL} files into a downloadable Research Object Bundle including attribution, versioning and dependency metadata in the manifest, allowing it to be easily shared. The tool operates over any workflow held in a {GitHub} repository. Other features include: path visualization from parents and children nodes; nested workflows support; workflow graph download in a range of image formats; a gallery of previously submitted workflows; and support for private git repositories and public {GitHub} including live updates over versioned workflows. The {CWL} Viewer is the de facto {CWL} visualization suite and has been enthusiastically received by the {CWL} community. Project Website: https://view.commonwl.org/. Source Code: https://github.com/common-workflow-language/cwlviewer (https://doi.org/10.5281/zenodo.823535). Software License: Apache License, Version 2.0. Submitted abstract: {CWL} Viewer: The Common Workflow Language Viewer. Poster: https://doi.org/10.7490/f1000research.1114375.1. Technical Report: Reproducible Research using Research Objects (https://doi.org/10.5281/zenodo.823295).}
}
@inproceedings{robinson_2017a,
title = {{CWL} Viewer},
author = {Robinson, Mark and Soiland-Reyes, Stian and Crusoe, Michael R. and Goble, Carole},
url = {https://view.commonwl.org/},
year = {2017},
month = {jul},
day = {6},
urldate = {2017-08-16},
f1000-projects = {{CWLProv} and Your publications},
abstract = {The Common Workflow Language ({CWL}) project emerged from the {BOSC} 2014 Codefest as a grassroots, multi-vendor working group to tackle the portability of data analysis workflows. Its specification for describing workflows and command line tools aims to make them portable and scalable across a variety of computing platforms. At its heart {CWL} is a set of structured text files ({YAML}) with various extensibility points to the format. However, the {CWL} syntax and multi-file collections are not conducive to workflow browsing, exchange and understanding: for this we need a visualization suite. {CWL} Viewer is a richly featured {CWL} visualization suite that graphically presents and lists the details of {CWL} workflows with their inputs, outputs and steps. It also packages the {CWL} files into a downloadable Research Object Bundle including attribution, versioning and dependency metadata in the manifest, allowing it to be easily shared. The tool operates over any workflow held in a {GitHub} repository. Other features include: path visualization from parents and children nodes; nested workflows support; workflow graph download in a range of image formats; a gallery of previously submitted workflows; and support for private git repositories and public {GitHub} including live updates over versioned workflows. The {CWL} Viewer is the de facto {CWL} visualization suite and has been enthusiastically received by the {CWL} community.}
}
@article{mcmurry_2017,
title = {Identifiers for the 21st century: How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data.},
author = {{McMurry}, Julie A and Juty, Nick and Blomberg, Niklas and Burdett, Tony and Conlin, Tom and Conte, Nathalie and Courtot, Mélanie and Deck, John and Dumontier, Michel and Fellows, Donal K and Gonzalez-Beltran, Alejandra and Gormanns, Philipp and Grethe, Jeffrey and Hastings, Janna and Hériché, Jean-Karim and Hermjakob, Henning and Ison, Jon C and Jimenez, Rafael C and Jupp, Simon and Kunze, John and Laibe, Camille and Le Novère, Nicolas and Malone, James and Martin, Maria Jesus and {McEntyre}, Johanna R and Morris, Chris and Muilu, Juha and Müller, Wolfgang and Rocca-Serra, Philippe and Sansone, Susanna-Assunta and Sariyar, Murat and Snoep, Jacky L and Soiland-Reyes, Stian and Stanford, Natalie J and Swainston, Neil and Washington, Nicole and Williams, Alan R and Wimalaratne, Sarala M and Winfree, Lilly M and Wolstencroft, Katherine and Goble, Carole and Mungall, Christopher J and Haendel, Melissa A and Parkinson, Helen},
pages = {e2001414},
year = {2017},
month = {jun},
day = {29},
urldate = {2018-07-13},
journal = {{PLoS} Biol},
volume = {15},
number = {6},
doi = {10.1371/journal.pbio.2001414},
pmid = {28662064},
pmcid = {PMC5490878},
f1000-projects = {{CWLProv} and Your publications},
abstract = {In many disciplines, data are highly decentralized across thousands of online databases (repositories, registries, and knowledgebases). Wringing value from such databases depends on the discipline of data science and on the humble bricks and mortar that make integration possible; identifiers are a core component of this integration infrastructure. Drawing on our experience and on work by other groups, we outline 10 lessons we have learned about the identifier qualities and best practices that facilitate large-scale data integration. Specifically, we propose actions that identifier practitioners (database providers) should take in the design, provision and reuse of identifiers. We also outline the important considerations for those referencing identifiers in various circumstances, including by authors and data generators. While the importance and relevance of each lesson will vary by context, there is a need for increased awareness about how to avoid and manage common identifier problems, especially those related to persistence and web-accessibility/resolvability. We focus strongly on web-based identifiers in the life sciences; however, the principles are broadly relevant to other disciplines.}
}
@inproceedings{khan_2017,
title = {{CWL}+Research Object == Complete Provenance},
author = {Khan, Farah Zaib and Soiland-Reyes, Stian and Lonie, Andrew and Sinnott, Richard},
url = {https://github.com/common-workflow-language/common-workflow-language/wiki/Research-Object-Proposal},
year = {2017},
month = {jun},
day = {14},
urldate = {2017-08-16},
f1000-projects = {{CWLProv} and Your publications},
abstract = {The term provenance refers to ``the beginning of something's existence; something's origin'' or ``a record of ownership of a work of art or an antique, used as a guide to authenticity or quality''. Provenance tracking is crucial in scientific studies, where workflows have emerged as an exemplar approach to mechanize data-intensive analyses. Gil et al. analyze the challenges of scientific workflows and conclude that a formally specified workflow helps ``accelerate the rate of scientific process'' and facilitates others in reproducing a given experiment, provided that provenance of the end-to-end process is captured at every level. We have implemented an exemplar {GATK} variant calling workflow using three approaches to workflow definition, namely Galaxy, {CWL} and Cpipe, to identify assumptions implicit in these approaches. These assumptions lead to limited or no understanding of reproducibility requirements, due to lack of documentation and comprehensive provenance tracking, and resulted in the identification of provenance information crucial for genomic workflows. {CWL} provides a declarative approach to workflow declaration, making minimal assumptions about the precise software environment, base software dependencies, configuration settings, alteration of parameters and software versions. It aims to provide an open source extensible standard to build flexible and customized workflows including intricate details of every process. It facilitates capture of information by supporting declaration of requirements, `cwl:tool` and checksums, etc. Currently, there is no mechanism to gather the information produced as a result of a workflow run into one bundle for future use. We propose to demonstrate the implementation of such a module for {CWL}.}
}
@inproceedings{chard_2016,
title = {I'll take that to go: Big data bags and minimal identifiers for exchange of large, complex datasets},
author = {Chard, Kyle and D'Arcy, Mike and Heavner, Ben and Foster, Ian and Kesselman, Carl and Madduri, Ravi and Rodriguez, Alexis and Soiland-Reyes, Stian and Goble, Carole and Clark, Kristi and Deutsch, Eric W. and Dinov, Ivo and Price, Nathan and Toga, Arthur},
pages = {319-328},
publisher = {IEEE},
year = {2016},
month = {dec},
day = {5},
urldate = {2018-07-13},
isbn = {978-1-4673-9005-7},
doi = {10.1109/bigdata.2016.7840618},
f1000-projects = {{CWLProv} and Your publications},
abstract = {Big data workflows often require the assembly and exchange of complex, multi-element datasets. For example, in biomedical applications, the input to an analytic pipeline can be a dataset consisting of thousands of images and genome sequences assembled from diverse repositories, requiring a description of the contents of the dataset in a concise and unambiguous form. Typical approaches to creating datasets for big data workflows assume that all data reside in a single location, requiring costly data marshaling and permitting errors of omission and commission because dataset members are not explicitly specified. We address these issues by proposing simple methods and tools for assembling, sharing, and analyzing large and complex datasets that scientists can easily integrate into their daily workflows. These tools combine a simple and robust method for describing data collections ({BDBags}), data descriptions (Research Objects), and simple persistent identifiers (Minids) to create a powerful ecosystem of tools and services for big data analysis and sharing. We present these tools and use biomedical case studies to illustrate their use for the rapid assembly, sharing, and analysis of large datasets.},
booktitle = {2016 {IEEE} International Conference on Big Data (Big Data)}
}
@article{tavernaprov,
author = {Soiland-Reyes, Stian and Alper, Pinar and Goble, Carole},
title = {Tracking workflow execution with {TavernaProv}.},
note = {PROV Three Years Later; workshop at Provenance Week 2016, McLean, Virginia, USA.},
doi = {10.5281/zenodo.51314},
year = {2016},
month = {jun},
day = {6},
}
@book{nies_2014,
title = {{PROV}-Dictionary: Modeling Provenance for Dictionary Data Structures},
author = {De Nies, Tom and Coppens, Sam and Missier, Paolo and Moreau, Luc and Cheney, James and Lebo, Timothy and Soiland-Reyes, Stian},
publisher = {W3C},
year = {2014},
month = {apr},
day = {30},
urldate = {2017-08-16},
f1000-projects = {{CWLProv} and Your publications},
abstract = {Provenance is information about entities, activities, and people involved in producing a piece of data or thing, which can be used to form assessments about its quality, reliability or trustworthiness. This document describes extensions to {PROV} to facilitate the modeling of provenance for dictionary data structures. {PROV}-{DM} specifies a Collection as an entity that provides a structure to some constituents, which are themselves entities. However, some applications may need a mechanism to specify more structure to a Collection, in order to accurately describe its provenance. Therefore, in this document, we introduce Dictionary, a specific type of Collection with a logical structure consisting of key-entity pairs.}
}
@article{ciccarese_2013a,
title = {Web Annotation as a First-Class Object},
author = {Ciccarese, Paolo and Soiland-Reyes, Stian and Clark, Tim},
pages = {71-75},
year = {2013},
month = {nov},
urldate = {2018-07-13},
journal = {{IEEE} Internet Comput},
volume = {17},
number = {6},
issn = {1089-7801},
doi = {10.1109/MIC.2013.123},
f1000-projects = {{CWLProv} and Your publications}
}
@online{moreau_2013,
title = {{PROV}-N: The Provenance Notation},
author = {Moreau, Luc and Missier, Paolo and Cheney, James and Soiland-Reyes, Stian},
year = {2013},
month = {apr},
day = {30},
url = {http://www.w3.org/TR/2013/REC-prov-n-20130430/},
note = {W3C Recommendation 30 April 2013},
f1000-projects = {{CWLProv} and Your publications},
abstract = {Provenance is information about entities, activities, and people involved in producing a piece of data or thing, which can be used to form assessments about its quality, reliability or trustworthiness. {PROV}-{DM} is the conceptual data model that forms a basis for the {W3C} provenance ({PROV}) family of specifications. {PROV}-{DM} distinguishes core structures, forming the essence of provenance information, from extended structures catering for more specific uses of provenance. {PROV}-{DM} is organized in six components, respectively dealing with: (1) entities and activities, and the time at which they were created, used, or ended; (2) derivations of entities from entities; (3) agents bearing responsibility for entities that were generated and activities that happened; (4) a notion of bundle, a mechanism to support provenance of provenance; (5) properties to link entities that refer to the same thing; and (6) collections forming a logical structure for its members. To provide examples of the {PROV} data model, the {PROV} notation ({PROV}-N) is introduced: aimed at human consumption, {PROV}-N allows serializations of {PROV} instances to be created in a compact manner. {PROV}-N facilitates the mapping of the {PROV} data model to concrete syntax, and is used as the basis for a formal semantics of {PROV}. The purpose of this document is to define the {PROV}-N notation.}
}
@online{PROVN,
author = {Moreau, Luc and Missier, Paolo and Cheney, James and Soiland-Reyes, Stian},
title = {PROV-N: The Provenance Notation},
url = {http://www.w3.org/TR/2013/REC-prov-n-20130430/},
year = {2013},
month = {apr},
day = {30},
urldate = {2018-09-22},
note = {W3C Recommendation 30 April 2013}
}
@article{wings2011,
title = {Wings: Intelligent Workflow-Based Design of Computational Experiments},
author = {Gil, Yolanda and Ratnakar, Varun and Kim, Jihie and Gonzalez-Calero, Pedro and Groth, Paul and Moody, Joshua and Deelman, Ewa},
journal = {{IEEE} Intelligent Systems},
year = {2011},
month = {jan},
volume = {26},
number = {1},
pages = {62-72},
issn = {1541-1672},
doi = {10.1109/MIS.2010.9}
}
@inproceedings{belhajjame_2013,
title = {A workflow {PROV}-corpus based on Taverna and Wings},
author = {Belhajjame, Khalid and Zhao, Jun and Garijo, Daniel and Garrido, Aleix and Soiland-Reyes, Stian and Alper, Pinar and Corcho, Oscar},
pages = {331},
publisher = {{ACM} Press},
url = {http://dl.acm.org/citation.cfm?doid=2457317.2457376},
year = {2013},
month = {mar},
day = {18},
urldate = {2018-07-13},
isbn = {9781450315999},
doi = {10.1145/2457317.2457376},
address = {New York, New York, {USA}},
f1000-projects = {{CWLProv} and Your publications},
booktitle = {Proceedings of the Joint {EDBT}/{ICDT} 2013 Workshops on - {EDBT} '13}
}
@inproceedings{belhajjame_2012,
title = {Workflow-centric research objects: First class citizens in scholarly discourse},
author = {Belhajjame, Khalid and Corcho, Oscar and Garijo, Daniel and Zhao, Jun and Missier, Paolo and Newman, David and Palma, Raúl and Bechhofer, Sean and García Cuesta, Esteban and Gómez-Pérez, José Manuel and Klyne, Graham and Page, Kevin and Roos, Marco and Ruiz, José Enrique and Soiland-Reyes, Stian and Verdes-Montenegro, Lourdes and De Roure, David and Goble, Carole A.},
pages = {1-12},
year = {2012},
url = {http://ceur-ws.org/Vol-903/paper-01.pdf},
urldate = {2017-08-16},
booktitle = {Proceedings of the 2nd Workshop on Semantic Publishing ({SePublica 2012})},
journal = {{CEUR} Workshop Proceedings},
volume = {903},
issn = {1613-0073},
series = {{CEUR} Workshop Proceedings}
}
@inproceedings{belhajjame_2011,
title = {Fostering Scientific Workflow Preservation through Discovery of Substitute Services},
author = {Belhajjame, Khalid and Goble, Carole and Soiland-Reyes, Stian and De Roure, David},
pages = {97-104},
publisher = {IEEE},
year = {2011},
month = {dec},
day = {5},
urldate = {2018-07-13},
isbn = {978-1-4577-2163-2},
doi = {10.1109/eScience.2011.22},
f1000-projects = {{CWLProv} and Your publications},
booktitle = {2011 {IEEE} Seventh International Conference on {eScience}}
}
@incollection{missier_2010,
booktitle = {Scientific and Statistical Database Management},
title = {Taverna, Reloaded},
author = {Missier, Paolo and Soiland-Reyes, Stian and Owen, Stuart and Tan, Wei and Nenadic, Alexandra and Dunlop, Ian and Williams, Alan and Oinn, Tom and Goble, Carole},
editor = {Gertz, Michael and Ludäscher, Bertram},
series = {Lecture notes in computer science},
pages = {471-481},
publisher = {Springer Berlin Heidelberg},
year = {2010},
urldate = {2017-08-16},
volume = {6187},
isbn = {978-3-642-13817-1},
issn = {0302-9743},
doi = {10.1007/978-3-642-13818-8\_33},
address = {Berlin, Heidelberg},
f1000-projects = {{CWLProv} and Your publications}
}
@article{garijo_2017,
title = {Abstract, link, publish, exploit: An end to end framework for workflow sharing},
author = {Garijo, Daniel and Gil, Yolanda and Corcho, Oscar},
pages = {271-283},
year = {2017},
month = {oct},
urldate = {2018-07-13},
journal = {Future Generation Computer Systems},
volume = {75},
issn = {0167739X},
doi = {10.1016/j.future.2017.01.008},
f1000-projects = {{CWLProv} and Linked Data},
abstract = {Scientific workflows are increasingly used to manage and share scientific computations and methods to analyze data. A variety of systems have been developed that store the workflows executed and make them part of public repositories. However, workflows are published in the idiosyncratic format of the workflow system used for the creation and execution of the workflows. Browsing, linking and using the stored workflows and their results often becomes a challenge for scientists who may only be familiar with one system. In this paper we present an approach for addressing this issue by publishing and exploiting workflows as data on the Web, with a representation that is independent from the workflow system used to create them. In order to achieve our goal, we follow the Linked Data Principles to publish workflow inputs, intermediate results, outputs and codes; and we reuse and extend well established standards like {W3C} {PROV}. We illustrate our approach by publishing workflows and consuming them with different tools designed to address common scenarios for workflow exploitation.}
}
@article{cohen2017scientific,
title = {Scientific workflows for computational reproducibility in the life sciences: Status, challenges and opportunities},
author = {Cohen-Boulakia, Sarah and Belhajjame, Khalid and Collin, Olivier and Chopard, Jérôme and Froidevaux, Christine and Gaignard, Alban and Hinsen, Konrad and Larmande, Pierre and Le Bras, Yvan and Lemoine, Frédéric and Mareuil, Fabien and Ménager, Hervé and Pradal, Christophe and Blanchet, Christophe},
pages = {284-298},
year = {2017},
month = {oct},
urldate = {2018-07-13},
journal = {Future Generation Computer Systems},
volume = {75},
issn = {0167739X},
doi = {10.1016/j.future.2017.01.012},
f1000-projects = {{CWL} and {CWLProv} and Workflows},
abstract = {With the development of new experimental technologies, biologists are faced with an avalanche of data to be computationally analyzed for scientific advancements and discoveries to emerge. Faced with the complexity of analysis pipelines, the large number of computational tools, and the enormous amount of data to manage, there is compelling evidence that many if not most scientific discoveries will not stand the test of time: increasing the reproducibility of computed results is of paramount importance. The objective we set out in this paper is to place scientific workflows in the context of reproducibility. To do so, we define several kinds of reproducibility that can be reached when scientific workflows are used to perform experiments. We characterize and define the criteria that need to be catered for by reproducibility-friendly scientific workflow systems, and use such criteria to place several representative and widely used workflow systems and companion tools within such a framework. We also discuss the remaining challenges posed by reproducible scientific workflows in the life sciences. Our study was guided by three use cases from the life science domain involving in silico experiments.}
}
@inproceedings{dahuo_2015,
title = {Smart Container: an ontology towards conceptualizing Docker},
author = {Huo, Da and Nabrzyski, Jaroslaw and Vardeman, II, Charles F.},
series = {{CEUR} Workshop Proceedings},
url = {http://ceur-ws.org/Vol-1486/paper\_89.pdf},
year = {2015},
month = {oct},
day = {11},
urldate = {2017-10-04},
volume = {1486},
issn = {1613-0073},
f1000-projects = {{CWLProv} and Linked Data and Reproducibility},
booktitle = {Proceedings of the {ISWC} 2015 Posters \& Demonstrations Track. Co-located with the 14th International Semantic Web Conference ({ISWC}-2015)}
}
@misc{robinson_other_2017,
title = {{CWL} Viewer: The Common Workflow Language viewer},
author = {Robinson, Mark and Soiland-Reyes, Stian and Crusoe, Michael R and Goble, Carole},
year = {2017},
month = {jul},
day = {22},
urldate = {2017-11-03},
volume = {6},
f1000-projects = {{CWL} and {CWLProv} and Debianpaper},
}
@article{kanwal_2017,
title = {Investigating reproducibility and tracking provenance - A genomic workflow case study.},
author = {Kanwal, Sehrish and Khan, Farah Zaib and Lonie, Andrew and Sinnott, Richard O},
pages = {337},
year = {2017},
month = {jul},
day = {12},
urldate = {2017-11-03},
journal = {{BMC} Bioinformatics},
volume = {18},
number = {1},
doi = {10.1186/s12859-017-1747-0},
pmid = {28701218},
pmcid = {PMC5508699}
}