@article{Chakroborti-2018-Optimized,
title = "Optimized Storing of Workflow Outputs through Mining Association Rules",
author = "Chakroborti, Debasish and
Mondal, Manishankar and
Roy, Banani and
Roy, Chanchal K. and
Schneider, Kevin A.",
journal = "2018 IEEE International Conference on Big Data (Big Data)",
year = "2018",
publisher = "IEEE",
url = "https://gwf-uwaterloo.github.io/gwf-publications/G18-26001",
doi = "10.1109/bigdata.2018.8622351",
abstract = "Workflows are frequently built and used to systematically process large datasets using workflow management systems (WMS). A workflow (i.e., a pipeline) is a finite set of processing modules organized as a series of steps that is applied to an input dataset to produce a desired output. In a workflow management system, users generally create workflows manually for their own investigations. However, workflows can sometimes be lengthy and the constituent processing modules might often be computationally expensive. In this situation, it would be beneficial if users could reuse intermediate stage results generated by previously executed workflows for executing their current workflow. In this paper, we propose a novel technique based on association rule mining for suggesting which intermediate stage results from a workflow that a user is going to execute should be stored for reusing in the future. We call our proposed technique, RISP (Recommending Intermediate States from Pipelines). According to our investigation on hundreds of workflows from two scientific workflow management systems, our proposed technique can efficiently suggest intermediate state results to store for future reuse. The results that are suggested to be stored have a high reuse frequency. Moreover, for creating around 51{\%} of the entire pipelines, we can reuse results suggested by our technique. Finally, we can achieve a considerable gain (74{\%} gain) in execution time by reusing intermediate results stored by the suggestions provided by our proposed technique. We believe that our technique (RISP) has the potential to have a significant positive impact on Big-Data systems, because it can considerably reduce execution time of the workflows through appropriate reuse of intermediate state results, and hence, can improve the performance of the systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="Chakroborti-2018-Optimized">
<titleInfo>
<title>Optimized Storing of Workflow Outputs through Mining Association Rules</title>
</titleInfo>
<name type="personal">
<namePart type="given">Debasish</namePart>
<namePart type="family">Chakroborti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manishankar</namePart>
<namePart type="family">Mondal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Banani</namePart>
<namePart type="family">Roy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chanchal</namePart>
<namePart type="given">K</namePart>
<namePart type="family">Roy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Schneider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>2018 IEEE International Conference on Big Data (Big Data)</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>IEEE</publisher>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Workflows are frequently built and used to systematically process large datasets using workflow management systems (WMS). A workflow (i.e., a pipeline) is a finite set of processing modules organized as a series of steps that is applied to an input dataset to produce a desired output. In a workflow management system, users generally create workflows manually for their own investigations. However, workflows can sometimes be lengthy and the constituent processing modules might often be computationally expensive. In this situation, it would be beneficial if users could reuse intermediate stage results generated by previously executed workflows for executing their current workflow. In this paper, we propose a novel technique based on association rule mining for suggesting which intermediate stage results from a workflow that a user is going to execute should be stored for reusing in the future. We call our proposed technique, RISP (Recommending Intermediate States from Pipelines). According to our investigation on hundreds of workflows from two scientific workflow management systems, our proposed technique can efficiently suggest intermediate state results to store for future reuse. The results that are suggested to be stored have a high reuse frequency. Moreover, for creating around 51% of the entire pipelines, we can reuse results suggested by our technique. Finally, we can achieve a considerable gain (74% gain) in execution time by reusing intermediate results stored by the suggestions provided by our proposed technique. We believe that our technique (RISP) has the potential to have a significant positive impact on Big-Data systems, because it can considerably reduce execution time of the workflows through appropriate reuse of intermediate state results, and hence, can improve the performance of the systems.</abstract>
<identifier type="citekey">Chakroborti-2018-Optimized</identifier>
<identifier type="doi">10.1109/bigdata.2018.8622351</identifier>
<location>
<url>https://gwf-uwaterloo.github.io/gwf-publications/G18-26001</url>
</location>
<part>
<date>2018</date>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Optimized Storing of Workflow Outputs through Mining Association Rules
%A Chakroborti, Debasish
%A Mondal, Manishankar
%A Roy, Banani
%A Roy, Chanchal K.
%A Schneider, Kevin A.
%J 2018 IEEE International Conference on Big Data (Big Data)
%D 2018
%I IEEE
%F Chakroborti-2018-Optimized
%X Workflows are frequently built and used to systematically process large datasets using workflow management systems (WMS). A workflow (i.e., a pipeline) is a finite set of processing modules organized as a series of steps that is applied to an input dataset to produce a desired output. In a workflow management system, users generally create workflows manually for their own investigations. However, workflows can sometimes be lengthy and the constituent processing modules might often be computationally expensive. In this situation, it would be beneficial if users could reuse intermediate stage results generated by previously executed workflows for executing their current workflow. In this paper, we propose a novel technique based on association rule mining for suggesting which intermediate stage results from a workflow that a user is going to execute should be stored for reusing in the future. We call our proposed technique, RISP (Recommending Intermediate States from Pipelines). According to our investigation on hundreds of workflows from two scientific workflow management systems, our proposed technique can efficiently suggest intermediate state results to store for future reuse. The results that are suggested to be stored have a high reuse frequency. Moreover, for creating around 51% of the entire pipelines, we can reuse results suggested by our technique. Finally, we can achieve a considerable gain (74% gain) in execution time by reusing intermediate results stored by the suggestions provided by our proposed technique. We believe that our technique (RISP) has the potential to have a significant positive impact on Big-Data systems, because it can considerably reduce execution time of the workflows through appropriate reuse of intermediate state results, and hence, can improve the performance of the systems.
%R 10.1109/bigdata.2018.8622351
%U https://gwf-uwaterloo.github.io/gwf-publications/G18-26001
%U https://doi.org/10.1109/bigdata.2018.8622351
Markdown (Informal)
[Optimized Storing of Workflow Outputs through Mining Association Rules](https://gwf-uwaterloo.github.io/gwf-publications/G18-26001) (Chakroborti et al., GWF 2018)
ACL
- Debasish Chakroborti, Manishankar Mondal, Banani Roy, Chanchal K. Roy, and Kevin A. Schneider. 2018. Optimized Storing of Workflow Outputs through Mining Association Rules. 2018 IEEE International Conference on Big Data (Big Data).