% Ganguly et al. (2018), "Retrieving and classifying instances of source code
% plagiarism", Information Retrieval Journal 21(1), pp. 1--23.
% The month is given as the predefined macro (feb) rather than a quoted
% literal, so each bibliography style can expand or localise it itself.
% The "day" field is not a standard BibTeX field; standard styles ignore it.
@article{Ganguly2018,
  author   = {Ganguly, Debasis and Jones, Gareth J. F. and Ram{\'i}rez-de-la-Cruz, Aar{\'o}n and Ram{\'i}rez-de-la-Rosa, Gabriela and Villatoro-Tello, Esa{\'u}},
  title    = {Retrieving and classifying instances of source code plagiarism},
  journal  = {Information Retrieval Journal},
  year     = {2018},
  month    = feb,
  day      = {01},
  volume   = {21},
  number   = {1},
  pages    = {1--23},
  abstract = {Automatic detection of source code plagiarism is an important research field for both the commercial software industry and within the research community.
              Existing methods of plagiarism detection primarily involve exhaustive pairwise document comparison, which does not scale well for large software collections.
              To achieve scalability, we approach the problem from an information retrieval (IR) perspective.
              We retrieve a ranked list of candidate documents in response to a pseudo-query representation constructed from each source code document in the collection.
              The challenge in source code document retrieval is that the standard bag-of-words (BoW) representation model for such documents is likely to result in many false positives being retrieved, because of the use of identical programming language specific constructs and keywords.
              To address this problem, we make use of an abstract syntax tree (AST) representation of the source code documents.
              While the IR approach is efficient, it is essentially unsupervised in nature.
              To further improve its effectiveness, we apply a supervised classifier (pre-trained with features extracted from sample plagiarized source code pairs) on the top ranked retrieved documents.
              We report experiments on the SOCO-2014 dataset comprising 12K Java source files with almost 1M lines of code.
              Our experiments confirm that the AST based approach produces significantly better retrieval effectiveness than a standard BoW representation, i.e., the AST based approach is able to identify a higher number of plagiarized source code documents at top ranks in response to a query source code document.
              The supervised classifier, trained on features extracted from sample plagiarized source code pairs, is shown to effectively filter and thus further improve the ranked list of retrieved candidate plagiarized documents.},
  issn     = {1573-7659},
  doi      = {10.1007/s10791-017-9313-y},
  url      = {https://doi.org/10.1007/s10791-017-9313-y},
}