2014-10-03 134 views
0

我有一個函數從一個目錄獲取文件列表,然後從列表中搜索匹配的文件名,但性能很糟糕——這是一個 LINQ 查詢的性能問題。

下面是函數:

/// <summary>
/// Lists every file under <paramref name="serverDirectory"/> (recursively) and reports which of the
/// given permit numbers have a file whose name starts with that number.
/// Well and DrillerLog compare the first 5 characters of the file name; RasterLog compares the first 14.
/// </summary>
/// <param name="permitNumbers">Permit numbers to look for; compared against the file-name prefix.</param>
/// <param name="serverDirectory">Root directory to scan.</param>
/// <param name="type">Selects the prefix length and is passed to each created fileStatus.</param>
/// <returns>
/// A fileStatus built with 1 and the file name for each matching file, followed by a fileStatus built
/// with 0 and an empty string for each permit number without a match. The list is empty when the
/// directory does not exist, contains no files, no permit numbers were given, or the type is not handled.
/// </returns>
public List<fileStatus> checkFilesStatus(List<string> permitNumbers, string serverDirectory, fileType type) 
    { 
     XmlConfigurator.Configure(); 
     log.Debug(string.Format("Beginning checkFilesStatus with following parameters > permitNumbers: {0} > serverDirectory: {1} > type: {2}", string.Join(",", permitNumbers.ToArray()), serverDirectory, type.ToString())); 
     List<fileStatus> results = new List<fileStatus>(); 
     DirectoryInfo dirInfo = new DirectoryInfo(serverDirectory); 
     if (dirInfo.Exists) 
     { 
      // GET LIST OF ALL FILES IN DIRECTORY 
      string[] files = System.IO.Directory.GetFiles(serverDirectory, "*", System.IO.SearchOption.AllDirectories); 

      // Joins every full path into one string for the log message.
      log.Debug(string.Format("List of all files in directory: {0}", string.Join(",", files))); 


      if (files.Length > 0 && permitNumbers.Count > 0) 
      { 
       log.Debug("Checking for matching files"); 
       // CHECK FOR MATCHING FILES 
       switch (type) 
       { 
        case fileType.Well: 

         // Deferred query: nothing runs here; the work happens each time matchingFiles is enumerated.
         // Everything after the last backslash is taken as the file name. permitNumbers is a List,
         // so each Contains call is a linear scan of the permit numbers.
         var matchingFiles = (from f in files 
              where f.Substring(f.LastIndexOf("\\") + 1).Length > 4 
              where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 5)) 
              select new fileStatus(fileType.Well, f.Substring(f.LastIndexOf("\\") + 1, 5), 1, f.Substring(f.LastIndexOf("\\") + 1))); 


         // Also deferred: a projection over matchingFiles, so enumerating it re-runs the file query above.
         var permitNumbersWithMatches = (from x in matchingFiles 
                 select x.PermitNumber); 

         // For each permit number, Contains enumerates permitNumbersWithMatches, which in turn
         // re-evaluates matchingFiles over the files array.
         var nonMatchingFiles = (from p in permitNumbers 
               where !permitNumbersWithMatches.Contains(p) 
               select new fileStatus(fileType.Well, p, 0, string.Empty)); 

         // AddRange is where the deferred queries are actually enumerated.
         results.AddRange(matchingFiles); 
         results.AddRange(nonMatchingFiles); 

         break; 
        case fileType.DrillerLog: 
         // Same logic as the Well case (5-character prefix), reusing the variables declared there.
         matchingFiles = (from f in files 
             where f.Substring(f.LastIndexOf("\\") + 1).Length > 4 
             where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 5)) 
             select new fileStatus(fileType.DrillerLog, f.Substring(f.LastIndexOf("\\") + 1, 5), 1, f.Substring(f.LastIndexOf("\\") + 1))); 

         permitNumbersWithMatches = (from x in matchingFiles 
                 select x.PermitNumber); 

         nonMatchingFiles = (from p in permitNumbers 
               where !permitNumbersWithMatches.Contains(p) 
              select new fileStatus(fileType.DrillerLog, p, 0, string.Empty)); 


         results.AddRange(matchingFiles); 
         results.AddRange(nonMatchingFiles); 

         break; 
        case fileType.RasterLog: 

         // Same logic again, but the compared prefix is the first 14 characters of the file name.
         matchingFiles = (from f in files 
             where f.Substring(f.LastIndexOf("\\") + 1).Length > 13 
             where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 14)) 
             select new fileStatus(fileType.RasterLog, f.Substring(f.LastIndexOf("\\") + 1, 14), 1, f.Substring(f.LastIndexOf("\\") + 1))); 

         permitNumbersWithMatches = (from x in matchingFiles 
                 select x.PermitNumber); 

         nonMatchingFiles = (from p in permitNumbers 
               where !permitNumbersWithMatches.Contains(p) 
              select new fileStatus(fileType.RasterLog, p, 0, string.Empty)); 



         results.AddRange(matchingFiles); 
         results.AddRange(nonMatchingFiles); 
         break; 
        default: 
         // Any other file type produces no results.
         break; 
       } 
       log.Debug("Done checking for matching files"); 
      } 
     } 
     return results; 

    } 

一旦執行到爲「matchingFiles」賦值的 LINQ 查詢,程序就會掛起。「permitNumbers」是一個很大的集合(例如 5000 個),「files」也是一大組文件。

我能做些什麼來加快速度?

考慮到下面提供的建議,我將功能修改爲如下,現在性能按預期工作。非常感謝你! =)

/// <summary>
/// Lists every file under <paramref name="serverDirectory"/> (recursively) and reports which of the
/// given permit numbers have a file whose name starts with that number.
/// Well and DrillerLog compare the first 5 characters of the file name; RasterLog compares the first 14.
/// </summary>
/// <param name="permitNumbers">Permit numbers to look for; duplicates are collapsed before matching.</param>
/// <param name="serverDirectory">Root directory to scan.</param>
/// <param name="type">Selects the prefix length and is passed to each created fileStatus.</param>
/// <returns>
/// A fileStatus built with 1 and the file name for each distinct matching file name, followed by a
/// fileStatus built with 0 and an empty string for each permit number without a match. The list is
/// empty when the directory does not exist, contains no files, no permit numbers were given, or the
/// type is not handled.
/// </returns>
public List<fileStatus> checkFilesStatus(List<string> permitNumbers, string serverDirectory, fileType type)
    {
        // Hash set gives constant-time membership tests instead of scanning the list per file.
        HashSet<string> numbers = new HashSet<string>(permitNumbers);
        XmlConfigurator.Configure();
        log.Debug(string.Format("Beginning checkFilesStatus with following parameters > permitNumbers: {0} > serverDirectory: {1} > type: {2}", string.Join(",", permitNumbers.ToArray()), serverDirectory, type.ToString()));

        List<fileStatus> results = new List<fileStatus>();
        DirectoryInfo dirInfo = new DirectoryInfo(serverDirectory);
        if (!dirInfo.Exists)
        {
            return results;
        }

        // GET LIST OF ALL FILES IN DIRECTORY
        string[] files = System.IO.Directory.GetFiles(serverDirectory, "*", System.IO.SearchOption.AllDirectories);
        // Only the file name matters for matching; identical names from different folders collapse to one entry.
        HashSet<string> fileNames = new HashSet<string>(files.Select(f => Path.GetFileName(f)));

        log.Debug(string.Format("List of all files in directory: {0}", string.Join(",", files)));

        if (fileNames.Count > 0 && numbers.Count > 0)
        {
            log.Debug("Checking for matching files");

            // The three handled file types differ only in how many leading characters of the
            // file name are compared; 0 means the type is not handled and yields no results.
            int prefixLength;
            switch (type)
            {
                case fileType.Well:
                case fileType.DrillerLog:
                    prefixLength = 5;
                    break;
                case fileType.RasterLog:
                    prefixLength = 14;
                    break;
                default:
                    prefixLength = 0;
                    break;
            }

            if (prefixLength > 0)
            {
                // CHECK FOR MATCHING FILES
                // Materialized once: the result is read twice below (for the set difference and for
                // AddRange), and a deferred query would rescan the names and rebuild every fileStatus.
                List<fileStatus> matchingFiles = fileNames
                    .Where(f => f.Length >= prefixLength && numbers.Contains(f.Substring(0, prefixLength)))
                    .Select(f => new fileStatus(type, f.Substring(0, prefixLength), 1, f))
                    .ToList();

                // Set difference: permit numbers for which no file was found.
                IEnumerable<fileStatus> nonMatchingFiles = numbers
                    .Except(matchingFiles.Select(m => m.PermitNumber))
                    .Select(p => new fileStatus(type, p, 0, string.Empty));

                results.AddRange(matchingFiles);
                results.AddRange(nonMatchingFiles);
            }

            log.Debug("Done checking for matching files");
        }

        return results;
    }
+0

「一旦它到達LINQ查詢」 哪一個?你有幾個。另外,5000並不是一個「非常大的集合」。 – 2014-10-03 15:06:56

+1

你是否分析了代碼?哪個linq查詢很慢? – 2014-10-03 15:07:06

+0

只要它擊中提供「matchingFiles」值的linq查詢。 – 2014-10-03 15:08:07

回答

2

你創建了一個查詢 matchingFiles,它在被迭代時會遍歷你的所有文件,以多種方式對它們做字符串處理,並且還要對許可證號碼集合做線性搜索。然後,你對每一個許可證號碼都執行一次該查詢(如果緩存不足,還需要反覆從磁盤讀取大量數據),每次又要做線性搜索。這導致 O(N^2 * M) 的漸近複雜度,其中 N 是許可證號碼的數目,M 是文件的數目。這……非常糟糕。

這裏的關鍵是避免1)進行線性搜索和2)多次迭代複雜查詢,特別是避免對其他序列中的每個項目進行迭代。

對於 #1,只需把 permitNumbers 改成 HashSet<string> 而不是列表,這樣檢查某個項目是否包含在其中就成爲 O(1) 操作。

// Permit numbers that have no matching file: the set difference between all requested
// numbers and the numbers found on the matching files.
var nonMatchingFiles =
    from permit in permitNumbers.Except(permitNumbersWithMatches)
    select new fileStatus(fileType.Well, permit, 0, string.Empty);
1

(接上一個回答)對於 #2,用只需迭代源序列一次的操作(即上面使用 Except 的代碼)取代第三個查詢。

我會消除所有重複的 f.Substring(f.LastIndexOf("\\") + 1) 調用,改爲只調用一次 Path.GetFileName(f):

例如

// Strip the directory part once per path instead of repeating Substring/LastIndexOf.
var fileNames = files.Select(f => Path.GetFileName(f));

// Keep names with at least 5 characters whose 5-character prefix is one of the permit numbers.
var matchingFiles = (from fname in fileNames
                     where fname.Length > 4
                     where permitNumbers.Contains(fname.Substring(0, 5))
                     select new fileStatus(fileType.Well, fname.Substring(0, 5), 1, fname));
+0

這會提高代碼的可讀性,但它不可能對性能產生有意義的影響。 – Servy 2014-10-03 16:06:45