% Journal article record for Nafi et al. (2020).
% `journal` holds only the journal name; the volume number is carried by `volume`,
% and `pages` holds the article number.
@article{Nafi-2020-A,
  author    = {Nafi, Kawser Wazed and
               Roy, Banani and
               Roy, Chanchal K. and
               Schneider, Kevin A.},
  title     = {A universal cross language software similarity detector for open source software categorization},
  journal   = {Journal of Systems and Software},
  volume    = {162},
  pages     = {110491},
  year      = {2020},
  publisher = {Elsevier BV},
  doi       = {10.1016/j.jss.2019.110491},
  url       = {https://gwf-uwaterloo.github.io/gwf-publications/G20-115001},
  abstract  = {While there are novel approaches for detecting and categorizing similar software applications, previous research focused on detecting similarity in applications written in the same programming language and not on detecting similarity in applications written in different programming languages. Cross-language software similarity detection is inherently more challenging due to variations in language, application structures, support libraries used, and naming conventions. In this paper we propose a novel model, CroLSim, to detect similar software applications across different programming languages. We define a semantic relationship among cross-language libraries and API methods (both local and third party) using functional descriptions and a word-vector learning model. Our experiments show that CroLSim can successfully detect cross-language similar software applications, which outperforms all existing approaches (mean average precision rate of 0.65, confidence rate of 3.6, and 75{\%} highly rated successful queries). Furthermore, we applied CroLSim to a source code repository to see whether our model can recommend cross-language source code fragments if queried directly with source code. From our experiments we found that CroLSim can recommend cross-language functional similar source code when source code is directly used as a query (average precision=0.28, recall=0.85, and F-Measure=0.40).},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the journal article with citekey Nafi-2020-A. -->
  <mods ID="Nafi-2020-A">
    <titleInfo>
      <title>A universal cross language software similarity detector for open source software categorization</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Kawser</namePart>
      <namePart type="given">Wazed</namePart>
      <namePart type="family">Nafi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Banani</namePart>
      <namePart type="family">Roy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chanchal</namePart>
      <namePart type="given">K</namePart>
      <namePart type="family">Roy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kevin</namePart>
      <namePart type="given">A</namePart>
      <namePart type="family">Schneider</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <!-- Host journal: the title is the journal name only; the volume is recorded under <part>. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Journal of Systems and Software</title>
      </titleInfo>
      <originInfo>
        <issuance>continuing</issuance>
        <publisher>Elsevier BV</publisher>
      </originInfo>
      <genre authority="marcgt">periodical</genre>
      <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>While there are novel approaches for detecting and categorizing similar software applications, previous research focused on detecting similarity in applications written in the same programming language and not on detecting similarity in applications written in different programming languages. Cross-language software similarity detection is inherently more challenging due to variations in language, application structures, support libraries used, and naming conventions. In this paper we propose a novel model, CroLSim, to detect similar software applications across different programming languages. We define a semantic relationship among cross-language libraries and API methods (both local and third party) using functional descriptions and a word-vector learning model. Our experiments show that CroLSim can successfully detect cross-language similar software applications, which outperforms all existing approaches (mean average precision rate of 0.65, confidence rate of 3.6, and 75% highly rated successful queries). Furthermore, we applied CroLSim to a source code repository to see whether our model can recommend cross-language source code fragments if queried directly with source code. From our experiments we found that CroLSim can recommend cross-language functional similar source code when source code is directly used as a query (average precision=0.28, recall=0.85, and F-Measure=0.40).</abstract>
    <identifier type="citekey">Nafi-2020-A</identifier>
    <identifier type="doi">10.1016/j.jss.2019.110491</identifier>
    <location>
      <url>https://gwf-uwaterloo.github.io/gwf-publications/G20-115001</url>
    </location>
    <!-- Volume and article number within the host journal. -->
    <part>
      <date>2020</date>
      <detail type="volume"><number>162</number></detail>
      <detail type="page"><number>110491</number></detail>
    </part>
  </mods>
</modsCollection>
%0 Journal Article
%T A universal cross language software similarity detector for open source software categorization
%A Nafi, Kawser Wazed
%A Roy, Banani
%A Roy, Chanchal K.
%A Schneider, Kevin A.
%J Journal of Systems and Software
%D 2020
%V 162
%I Elsevier BV
%F Nafi-2020-A
%X While there are novel approaches for detecting and categorizing similar software applications, previous research focused on detecting similarity in applications written in the same programming language and not on detecting similarity in applications written in different programming languages. Cross-language software similarity detection is inherently more challenging due to variations in language, application structures, support libraries used, and naming conventions. In this paper we propose a novel model, CroLSim, to detect similar software applications across different programming languages. We define a semantic relationship among cross-language libraries and API methods (both local and third party) using functional descriptions and a word-vector learning model. Our experiments show that CroLSim can successfully detect cross-language similar software applications, which outperforms all existing approaches (mean average precision rate of 0.65, confidence rate of 3.6, and 75% highly rated successful queries). Furthermore, we applied CroLSim to a source code repository to see whether our model can recommend cross-language source code fragments if queried directly with source code. From our experiments we found that CroLSim can recommend cross-language functional similar source code when source code is directly used as a query (average precision=0.28, recall=0.85, and F-Measure=0.40).
%R 10.1016/j.jss.2019.110491
%U https://gwf-uwaterloo.github.io/gwf-publications/G20-115001
%U https://doi.org/10.1016/j.jss.2019.110491
%P 110491
Markdown (Informal)
[A universal cross language software similarity detector for open source software categorization](https://gwf-uwaterloo.github.io/gwf-publications/G20-115001) (Nafi et al., GWF 2020)
ACL
- Kawser Wazed Nafi, Banani Roy, Chanchal K. Roy, and Kevin A. Schneider. 2020. A universal cross language software similarity detector for open source software categorization. Journal of Systems and Software, 162:110491.