所以我正在研究一個文本挖掘項目,並且正在嘗試實現信息增益。我有一個數據,其中每行描述一個文檔。所以一個新的行字符分裂不同的文件。爲什麼這個循環運行如此緩慢:c#
我必須生成一個矩陣,其中列是所有文檔中的所有不同的單詞,行是不同的文檔。如果該單詞存在或不存在於該文檔中,則該表中的每個單元格爲1(真)或0(假)。 共有987個文件,總詞數爲22860,總共有3680個單詞。因此3680個單詞與22860個單詞相比較。這個單詞運行緩慢,但它很好。花費更多時間的循環是當我遍歷單詞列表的對象以生成矩陣時。見下面
注意:我已經刪除了文件中的所有重複單詞。
class word_list
{
public string word;
public List<bool> doc= new List<bool>();
};//class ends
private void button2_Click(object sender, EventArgs e)
{
//Convert the string into an array of words
string[] w1 = richTextBox1.Text.Trim().Split('\n',' ').Select(x => x.Trim().ToLower()).Distinct().ToArray(); //all distinct words
string[] rich_doc = richTextBox1.Text.Trim().Split('\n'); //all documents array
List<word_list> words = new List<word_list>();
richTextBox2.Text+=("no. of distict words: " + w1.Length + ", no. of docs " + rich_doc.Length);
for (int i = 0; i < w1.Length; i++)
{
word_list temp = new word_list();
temp.word = w1[i]; //temp has the current distict word as class object
for(int j=0;j<rich_doc.Length;j++)//traverse all doc array
{
temp.doc.Add(false);
List<string> doc_word = Regex.Split(rich_doc[j], @"\b").Distinct(StringComparer.CurrentCultureIgnoreCase).ToList();
//richTextBox2.Text += ("\n no. of words in this doc: " + doc_word.Count);
//richTextBox2.SelectionStart = richTextBox1.Text.Length;
//richTextBox2.Focus();
int doc_count = doc_word.Count; // number of docs
for (int k = 0; k < doc_count; k++)//All words in a doc are compared
{
if(doc_word[k].ToLower() == w1[i].ToLower())
{
temp.doc[temp.doc.Count-1]=true;
break;
}
}
}
if ((words.Count - 1)>=0)
richTextBox2.Text += ("\n word(" + words.Count + "/" + w1.Length + "): " + words[words.Count - 1].word);
richTextBox2.SelectionStart = richTextBox1.Text.Length;
richTextBox2.Focus();
words.Add(temp);
}
//generate matrix
int t = rich_doc.Length; //no. of docs
int word_count = words.Count;
richTextBox1.Text = "Doc";
foreach (word_list w in words)
{
richTextBox1.Text += "\t" + w.word;
}
richTextBox1.Text += "\n";
//This loop is slow
for (int i = 0; i < t; i++) //traverse through number of docs
{
richTextBox1.Text += i + 1;
for (int h = 0; h < word_count; h++)//traverse through each distinct word in the list
{
if (words[h].doc[i])
richTextBox1.Text += "\t1";
else
richTextBox1.Text += "\t0";
}
richTextBox1.Text += "\n";
}
}//end of button 2
mediafire.com/?4mojnj4j153q76s:這是正在處理的數據,richtextbox2用於測試目的 – Rishabh876
考慮使用「StringBuilder」構建字符串而不是使用「TextBox.Text」作爲工作空間。 –