我的思路:对要进行比较的所有代码段进行词法分析,并转化为特定的标记(token)串,自己制定一个转换规则。接着,通过两两比较标记(token)串来确定代码之间的相似性,并由此确定代码之间抄袭的程度。
将这两个代码分别转换为token串后,基于算法RKR-GST( running-karp-rabin greedy-string-tiling)算法思想,循环求取两个标记串中未被匹配部分的最大公共子串,将其用空格代替,并根据公式求出两个token串A,B的相似度
源代码
#include
#include
#include
#include
#include
#include
#include
#define N 10000
#define M 10000
#define MAXSTRLEN 10000 //定义最大串长
typedef int status;
typedef unsigned char SString[MAXSTRLEN+1]; //串的定长顺序存储表示
SString a[3]={"int","long","short"};
SString b[2]={"float","double"};
SString c[15]={"&&","||","++","--","+","-","*","/","=",">=","<=","==","!=",">","<"};
SString d[12]={"[","]","{","}","(",")",",",";","'","#",";","."};
SString e[29]={"auto","break","case","char","const","continue","default","do","else","enum", "extern","for","goto","if","main","printf","register","return","signed","sizeof", "static","struct","switch","typedef","union","unsigned","void","while","volatile"};
HANDLE hOut;
DWORD written;
void ShadowWindowLine(char *str);
char type(char *str);
void token(char name[],char list[],char token[],FILE *table);
void simple(int MinMatchLen,FILE *fp1,FILE *fp2);
status replace(SString s,int pos,int len,int Ls);
int copy(float n);
void ShadowWindowLine(char *str)
{
SMALL_RECT rc;
CONSOLE_SCREEN_BUFFER_INFO bInfo; // 窗口缓冲区信息
WORD att0,att1,attBack;
int i, chNum = strlen(str);
GetConsoleScreenBufferInfo( hOut, &bInfo ); // 获取窗口缓冲区信息
// 计算显示窗口大小和位置
rc.Left = (bInfo.dwSize.X - chNum)/2 - 2;
rc.Top = 8; // 原代码段中此处为bInfo.dwSize.Y/2 - 2,但是如果您的DOS屏幕有垂直滚动条的话,还需要把滚动条下拉才能看到,为了方便就把它改为10
rc.Right = rc.Left + chNum + 4;
rc.Bottom = rc.Top + 4;
att0 = BACKGROUND_RED |BACKGROUND_BLUE; // 阴影属性
att1 = FOREGROUND_RED |FOREGROUND_GREEN |FOREGROUND_BLUE | FOREGROUND_INTENSITY | BACKGROUND_RED | BACKGROUND_BLUE | BACKGROUND_INTENSITY;// 文本属性
attBack = BACKGROUND_RED |BACKGROUND_GREEN |BACKGROUND_BLUE | BACKGROUND_INTENSITY; // 背景属性
// 设置阴影然后填充
COORD posShadow = {rc.Left+1, rc.Top+1}, posText = {rc.Left, rc.Top},posBack={0,0};
for (i=0;i<25;i++)
{
FillConsoleOutputAttribute(hOut, attBack,80, posBack, &written);
posBack.Y++;
}
for (i=0; i<5; i++)
{
FillConsoleOutputAttribute(hOut, att0, chNum + 4, posShadow, &written);
posShadow.Y++;
}
for (i=0;i<5;i++)
{
FillConsoleOutputAttribute(hOut, att1,chNum + 4, posText, &written);
posText.Y++;
}
// 写文本和边框
posText.X = rc.Left + 2;
posText.Y = rc.Top + 2;
WriteConsoleOutputCharacter(hOut, str, strlen(str), posText, &written);
SetConsoleTextAttribute(hOut, bInfo.wAttributes); // 恢复原来的属性
}
char type(char *str) //此函数判断单词类型
{
int i;
for(i=0;i<3;i++) //a中的关键字
{
if(strcmp(str,a[i])==0)
return 'K';
}
for(i=0;i<2;i++) //b中的关键字
{
if(strcmp(str,b[i])==0)
return 'E';
}
for(i=0;i<15;i++) //c中的符号
{
if(strcmp(str,c[i])==0)
return 'A';
}
for(i=0;i<12;i++) //d中符号
{
if(strcmp(str,d[i])==0)
return 'R';
}
for(i=0;i<29;i++) //e中的关键字
{
if(strcmp(str,e[i])==0)
return 'Y';
}
if(isdigit(str[0])) //0-9是数字
{
return 'N';
}
//一般的变量与字符
if(!isalnum(str[0]))
return 'H';
else return 'C';//变量
}
void token(char name[],char list[],char token[],FILE *table) //将两个文件中的字符串分别切割转换为token串
{
FILE *in,*out;
char ch,c,buffer[N],*link[M];
int i=0,j=0,k=0,LenLink=0;
if((in=fopen(name,"r+"))==NULL)
{
printf("源文件无法打开!\n");
exit(0);
}
if((out=fopen(list,"w+"))==NULL)
{
printf("文件写入失败!\n");
exit(0);
}
if((table=fopen(token,"w+"))==NULL)
{
printf("文件写入失败!\n");
exit(0);
}
while(!feof(in)) //逐字读取文件
{
ch=fgetc(in);
if(ch=='\t' || ch==' ' || ch== '\n') //去掉空格、制表符、回车
continue;
if(isalpha(ch)) //如果首字符是字母
{
while(isalnum(ch)&&(i<N)) //其他位是字母或数字
{
buffer[i++]=ch;
ch=fgetc(in);
}
buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*(strlen(buffer)+1));
strcpy(link[j-1],buffer);
i=0;
fseek(in,-1L,1); //在文件当中定位
}
else if(isdigit(ch)) //如果首字符是数字
{
while(isalnum(ch)&&(i<N)) //其他位是字母或数字
{
buffer[i++]=ch;
ch=fgetc(in);
}
buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*(strlen(buffer)+1));
strcpy(link[j-1],buffer);
i=0;
fseek(in,-1L,1);
}
else if(!isalnum(ch)) //如果首字符既不是数字也不是字母
{
if(ch!='\n'&&ch!=' '&&ch!='\t')
{
if(ch=='>'||ch=='<'||ch=='!') //以下代码实现超前搜索
{
if((c=fgetc(in))=='=') //>=,<=,!=这些需被认为是一个符号
{
buffer[i++]=ch;
buffer[i++]=c;
buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*3);
strcpy(link[j-1],buffer);
i=0;
}
else
{
buffer[i++]=ch;
buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*2);
strcpy(link[j-1],buffer);
i=0;
fseek(in,-1L,1);
}
}
else if(ch=='+'||ch=='-'||ch=='&'||ch=='|'||ch=='=')
{
if((c=fgetc(in))==ch) //++,--,&&,||,==这些需被认为是一个符号
{
buffer[i++]=ch;
buffer[i++]=c;
buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*3);
strcpy(link[j-1],buffer);
i=0;
}
else
{
buffer[i++]=ch;
buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*2);
strcpy(link[j-1],buffer);
i=0;
fseek(in,-1L,1);
}
}
else //其他符号
{
buffer[i++]=ch;
buffer[i]='\0';
link[j++]=(char *)malloc(sizeof(char)*2);
strcpy(link[j-1],buffer);
i=0;
}
}
}
}
LenLink = j-1; //存到link中的总长度
for(i=0;i<LenLink;i++) //打印token中的内容
{
c=type(link[i]); //
if(c=='N'||c=='A'||c=='R')//数字,符号在表中保留
fputs(link[i],table);
if(c=='C') //变量均替换为id
fputs("id",table);
if(c=='K')//关键字int,short,long替换为zh
fputs("zh",table);
if(c=='E')//关键字float,double替换为fu
fputs("fu",table);
if(c=='Y')//其他关键字不变
fputs(link[i],table);
if(c=='H')//汉字删掉
fputs("\0",table);
}
fclose(table);
fprintf(out,"\t***** 单词类型观察表 *****\n");//打印list中的内容
fprintf(out,"\t K --int,short,long \n");
fprintf(out,"\t E --float,double\n");
fprintf(out,"\t Y --其他关键字\n");
fprintf(out,"\t A --运算符号\n");
fprintf(out,"\t R --语言符号\n");
fprintf(out,"\t N --数字\n");
fprintf(out,"\t H --汉字\n");
fprintf(out,"\t C --一般变量或标识符\n");
fprintf(out,"\t*****************************\n");
for(i=0;i<LenLink;i++)
{
c=type(link[i]); //判断单词的类型
fputc('(',out);
fputc(c,out);
fputc(',',out);
fputs(link[i],out);
fputc(',',out);
fprintf(out,"%d",i);
fputc(')',out);
fputc('\n',out);
}
}
void simple(int MinMatchLen,FILE *fp1,FILE *fp2)//此函数计算相似度,MinMatchLen: 公共子串要达到的最小长度
{
SString A,B;
char ch,h;
int i=0,j=0,k,t,s,a=1,La,Lb,lena,lenb,x,y;
float n;
int MatchLen=0;//所有公共子串的总长度
int maxmatch;//当前最大公共子串长度
if ((fp1=fopen("f:\\token1.txt","r"))==NULL)//设定文件位于当前目录下,可更改为绝对路径
{
printf("文件打开失败!");
getch();
exit(0);
}
A[++i]=fgetc(fp1);
while(!feof(fp1))
A[++i]=fgetc(fp1);
fclose(fp1);
La=i-1;
printf("token串1长度为%d,",La);
if ((fp2=fopen("f:\\token2.txt","r"))==NULL)//设定文件位于当前目录下,可更改为绝对路径
{
printf("文件打开失败!");
getch();
exit(0);
}
B[++j]=fgetc(fp2);
while(!feof(fp2))
B[++j]=fgetc(fp2);
fclose(fp2);
Lb=j-1;
printf("token串2长度为%d\n",Lb);
printf("是否要查看这两个token串?Y/N ");
h=getchar();
if(h=='Y'||h=='y')
{
ShellExecute(NULL,"open","F:\\token1.txt",NULL,NULL,SW_SHOWNORMAL);
ShellExecute(NULL,"open","F:\\token2.txt",NULL,NULL,SW_SHOWNORMAL);
}
getchar();
printf("\n将超过指定长度的公共子串用空格替换,是否要查看细节?Y/N ");
ch=getchar();
lena=i-1;
lenb=j-1;
do
{
maxmatch=MinMatchLen;
for(i=1;i<=La;i++)
{
for(j=1;j<=Lb;j++)
{
k=0;
while((k<=La-i)&&(k<=Lb-j)&&(A[i+k]==B[j+k])&&((A[i+k]!='\0')||(B[j+k]!='\0'))&&(A[i+k]!=' ')&&(B[j+k]!=' '))
//串A的第i+k个字符与串B的第j+k个字符是否相等
k++;
if(k>maxmatch)
{
maxmatch=k;
x=i;
y=j;
}
}
}
if(maxmatch>MinMatchLen)
{
replace(A,x,maxmatch,La);
replace(B,y,maxmatch,Lb);
La=La-maxmatch+1;
Lb=Lb-maxmatch+1;
MatchLen+=maxmatch;
}
if(ch=='Y'||ch=='y')
{
printf("第%d次检查两串中的匹配串\n",a);
a++;
for(s=1;s<=La;s++)
printf("%c",A[s]);
printf("\n");
for(s=1;s<=Lb;s++)
printf("%c",B[s]);
printf("\n");
}
}
while(maxmatch>MinMatchLen);
printf("\n已经没有能够匹配的公共子串了\n");
n=(2.0*MatchLen)/(lena+lenb);
printf("公共子串的总长为%d,",MatchLen);
printf("根据公式\n");
printf("\t\t ——————————————————————————\n");
printf("\t\t| 相似度=(2×公共子串长度)÷(串A长度+串B长度) |\n");
printf("\t\t ——————————————————————————\n");
printf("这两串代码的相似度为%f\n",n);
copy(n);
}
status replace(SString s,int pos,int len,int Ls) //用空格来代替两个token串中的最大匹配子串
{
int i;
if(posLs-len+1||len<0)
return 0;
s[pos]=' ';
for(i=pos+len;i<=Ls;i++)
{
s[i-len+1]=s[i];
}
return 1;
}
int copy(float n) //此函数判断是否抄袭
{
printf("\n相似度超过0.8,则认为是抄袭");
if(n>=0.8)
printf("\n这两个代码有抄袭嫌疑,请做进一步检查");
else
printf("\n这两个代码没有抄袭嫌疑");
return 0;
}
void main(void)
{
hOut = GetStdHandle(STD_OUTPUT_HANDLE); // 获取标准输出设备句柄
SetConsoleOutputCP(936); // 设置代码页,此为中文简体
ShadowWindowLine(" 欢迎使用C语言代码复制/相似度检测软件 ");
getchar();
system("cls"); //清屏
char name1[50];
char name2[50]; //存储输入的文件路径字符串
FILE *f1,*f2;
system("color F3");
printf("\n代码1:");
scanf("%s",name1);
token(name1,"f:\list1.txt","f:\token1.txt",f1);
printf("代码2:");
scanf("%s",name2);
token(name2,"f:\list2.txt","f:\token2.txt",f2);
printf("\ntoken串已生成成功,");
getchar();
simple(3,f1,f2);
}