我已經寫了一個c程序,該程序應該將文件切片爲Rabin Karp algorithm。這是一個c#程序的改編,你可以找到Here。用rabin karp算法切片文件
它似乎工作,但問題依然存在。平均塊大小不是預期的。
用法如下:
拉賓總理WindowSize BoundaryMarker文件
其中:
拉賓是可執行文件的名稱。
總理是一個高素數。例如100007
WindowSize是滾動窗口的大小。例如48
BoundaryMarker處於指紋
文件設置爲0的比特的數目被處理
如果我設置BoundaryMarker至13的文件,我期望平均信息塊尺寸是8K。事實上,他們都不在8K左右。
我很難弄清楚我的程序出了什麼問題? 你能幫我嗎?
感謝
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
unsigned char* buffer;
int windowSize;
int writePointer = 0;
int readPointer = 0;
int dataSize = 0;
unsigned char PushChar(unsigned char c)
{ if (++writePointer >= windowSize) writePointer=0;
buffer[writePointer]=c;
dataSize++;
return(c);
}
unsigned char PopChar(void)
{ if (++readPointer >= windowSize) readPointer=0;
dataSize--;
return(buffer[readPointer]);
}
int main(int argc, char *argv[])
{ int fd;
unsigned char c;
unsigned long Q;
unsigned long D=256;
unsigned long pow=1;
int i,k,boundary,boundaryMarker,index;
unsigned char s;
if (argc != 5)
{ printf("\nUsage : rabin Prime WindowSize BoundaryMarker File\n\nwhere :\n");
printf("Prime is a high prime number. For instance 100007\n\n");
printf("WindowSize is the size of rolling window. For instance 48\n\n");
printf("BoundaryMarker is the number of bits set to 0 in a fingerprint\n\n");
printf("File is the file to process\n\n");
return(1);
}
sscanf(argv[1],"%lu",&Q);
sscanf(argv[2],"%d",&windowSize);
sscanf(argv[3],"%d",&boundaryMarker);
for(i=1,boundary=1;i<=boundaryMarker;i++) boundary=boundary*2;
boundary --;
//printf("Q = %lu windowSize = %d boundary = %d\n",Q,windowSize,boundary);
if ((buffer=(unsigned char*) malloc (sizeof(unsigned char)*windowSize))==NULL) return(1);
for (k=1; k < windowSize; k++) pow=(pow*D)%Q;
//printf("pow value %lu\n",pow);
unsigned long sig=0;
int lastIndex=0;
if ((fd=open(argv[4],O_RDONLY))<0) exit(1);
for (i=0; i <windowSize; i++)
{ read(fd,&c,1);
PushChar(c);
sig=(sig*D + (unsigned long)c) %Q;
}
//printf("sig value = %lu\n",sig);
index=0; lastIndex=0;
while (read(fd,&c,1))
{
s=PopChar();
//printf("sig = (%lu + %lu - %lu * %lu %% %lu) %lu",sig,Q,pow,(unsigned long) s,Q,Q);
sig = (sig + Q - pow*(unsigned long)s%Q)%Q;
//printf(" = %lu\n",sig);
s=PushChar(c);
//printf("sig2 = (%lu * %lu + %lu) %% %lu",sig,D,(unsigned long) s,Q);
sig = (sig*D + (unsigned long)s)%Q;
//printf(" = %lu\n",sig);
index++;
if ((sig & boundary)==0)
{ if (index - lastIndex >= 2048)
{ printf("sig & boundary = %lu & %lu Index=%d chunk size=%d\n",sig,boundary,index,index-lastIndex);
lastIndex=index;
}
}
else if (index -lastIndex >=65536)
{ printf("sig & boundary = %lu & %lu Index=%d chunk size=%d\n",sig,boundary,index,index-lastIndex);
lastIndex=index;
}
}
printf("Index=%d chunk size=%d\n",index,index-lastIndex);
close(fd);
return 1;
}
可以通過步帶調試器的代碼,並關注變量及其值。它可能會幫助您找出問題所在。 –
這兩個程序(c和c#都給出了相同的結果)。我認爲這是一個算法問題。該算法看起來像sedgewick rabin karp實現。我不知道問題出在哪裏。 –