所以我目前正在寫一些比較數據表中的列的東西。所以我基本上需要行,列,並且所有文本都是從.csv文件讀入的。C++與n-Gram最快的文本行比較?使用字符串,char *,矢量?
這個問題目前是需要多長時間來處理。我在C#中複製代碼,在行和列中使用'DataTable',需要大約13秒來處理1500行文本。 我的C++程序使用'vector < vector>'作爲表格,並且在這段時間內只處理175行文本。 算法是從我所看到的一樣,但我猜這件事情涉及到的數據類型使用C++,使這個慢我「米。
有沒有人對事業的想法?
//C++
void CheckColumn(int colNum)
{
for (int i = 1; i < RowCount; i++)
{
for (int j = i + 1; j < RowCount; j++)
{
nGram(Data[colNum][i], Data[colNum][j], 3);
}
}
}
double nGram(string one, string two, int count)
{
//Change to uppercase
transform(one.begin(), one.end(), one.begin(), toupper);
transform(two.begin(), two.end(), two.begin(), toupper);
//Set the first string to be the shorter one
if (one.size() > two.size())
{
string temp = one;
one = two;
two = temp;
}
//If nGram number is larger than the shortest string, set the nGram number to that length
if (one.size() < count)
{
count = one.size();
}
//Add matches
double weight = 0;
double possibleMatches = (2 * two.size() - 2 * count + 2)/2;
for (int i = 0; i < (one.size() - count + 1); i++)
{
for (int j = 0; j < (two.size() - count + 1); j++)
{
if (one.substr(i, count) == two.substr(j, count))
{
weight += 1;
break;
}
}
}
//Check for indivisible situations and otherwise calculate the weight
if ((possibleMatches == 0) || (weight == 0))
{
weight = 0;
}
else
{
weight = weight/possibleMatches;
}
return weight;
}
//C#
void CompareColumn(int colNum)
{
for(int i = 0; i < table.Rows.Count; i++)
{
for (int j = i + 1; j < table.Rows.Count; j++)
{
StringFunctions.nGram(table.Rows[i][colNum].ToString(), table.Rows[j][colNum].ToString(), 3);
}
}
}
public double nGram(string one, string two, int count)
{
//Change to uppercase
one = one.ToUpper();
two = two.ToUpper();
//Set the first string to be the shorter one
if (one.Length > two.Length)
{
string temp = one;
one = two;
two = temp;
}
//If nGram number is larger than the shortest string, set the nGram number to that length
if (one.Length < count)
{
count = one.Length;
}
//Add matches
double doubleMatch = 0;
double possibleMatches = (2 * two.Length - 2 * count + 2)/2;
for (int i = 0; i < (one.Length - count + 1); i++)
{
for (int j = 0; j < (two.Length - count + 1); j++)
{
if (one.Substring(i, count) == two.Substring(j, count))
{
doubleMatch += 1;
break;
}
}
}
//Check for indivisible situations and otherwise calculate the weight
if ((possibleMatches == 0) || (doubleMatch == 0))
{
doubleMatch = 0;
}
else
{
doubleMatch = doubleMatch/possibleMatches;
}
return doubleMatch;
}
我看到的第一個問題是你正在通過價值與參考。你正在浪費時間進行復制建設。我建議的第一件事是改變你的nGram的簽名來讀取'''ngram(string&,string&,count) – Freddy 2014-10-27 03:50:40
你有沒有簡介,看看什麼是最花時間? – 2014-10-27 05:07:59
您還應該確保您正在使用優化代碼進行測試/分析。 – 2014-10-27 05:25:33