2013-01-31 94 views
3

我想從給定的數據集生成直方圖。我已閱讀了構建直方圖的各種方案,其中最感興趣的是基於數據的箱寬(bin width)優化方法。

該方法出自 Shimazaki, H. 與 Shinomoto, S. (2007) 的論文「A method for selecting the bin size of a time histogram」(一種選擇時間直方圖箱寬的方法)。

上述方法通過估計來確定最佳的箱寬和分佈,這正是我所需要的,因爲樣本數據的分佈會變化,很難預先確定箱數和箱寬。

有人可以推薦好的資料或起點,讓我用 C# 編寫這樣的函數嗎?或者有沒有足夠接近的 C# 直方圖代碼?

非常感謝。

+1

這看起來非常翔實:http://toyoizumilab.brain.riken.jp/hideaki/res/histogram.html 他們似乎有示例代碼在不同的語言。算法本身似乎是一個相對簡單的優化算法,所以它不能太難以移植到C#。 –

回答

7

以下是我根據此算法的 Python 版本(來自 here)移植的 C# 代碼。我知道 API 還需要再打磨,但這應該足以讓你開始。對於相同的輸入數據,此代碼的結果與 Python 代碼生成的結果完全相同。

/// <summary>
/// Histogram helpers, including bin-width selection with the
/// Shimazaki-Shinomoto cost function.
/// </summary>
public class HistSample
{
    /// <summary>
    /// Selects the histogram bin width that minimises the Shimazaki-Shinomoto
    /// cost C = (2k - v) / d^2, where k and v are the mean and (biased) variance
    /// of the per-bin counts and d is the bin width.
    /// </summary>
    /// <param name="x">The samples to analyse.</param>
    /// <returns>
    /// The bin width with the lowest cost among the candidate bin counts, or 0
    /// when the samples have no spread (all values equal).
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
    /// <exception cref="InvalidOperationException"><paramref name="x"/> is empty.</exception>
    public static double CalculateOptimalBinWidth(double[] x)
    {
        if (x == null)
        {
            throw new ArgumentNullException("x");
        }

        // Candidate bin counts are minBins .. maxBins - 1 (upper bound exclusive).
        const int minBins = 4, maxBins = 50;

        // Min/Max throw InvalidOperationException for an empty array.
        double xMin = x.Min(), xMax = x.Max();
        double range = xMax - xMin;

        // With no spread every candidate width would be zero and the cost undefined.
        if (!(range > 0))
        {
            return 0.0;
        }

        double bestCost = double.PositiveInfinity;
        double bestWidth = range / minBins;

        for (int binCount = minBins; binCount < maxBins; binCount++)
        {
            double width = range / binCount;
            double[] edges = LinearSpace(xMin, xMax, binCount + 1);
            double[] counts = Histogram(x, edges);

            // The cost is defined over all bins, so none of them is dropped here.
            double mean = counts.Average();
            double variance = counts.Sum(c => (c - mean) * (c - mean)) / binCount;
            double cost = (2 * mean - variance) / (width * width);

            // Strict comparison keeps the first (widest) width when costs tie.
            if (cost < bestCost)
            {
                bestCost = cost;
                bestWidth = width;
            }
        }

        return bestWidth;
    }

    /// <summary>
    /// Counts how many values fall into each bin described by <paramref name="binEdges"/>.
    /// Bins are half-open, [lower, upper), except the last one, which also
    /// includes its upper edge, so every value is counted at most once.
    /// </summary>
    /// <param name="data">The values to count.</param>
    /// <param name="binEdges">Bin edges; bin i spans binEdges[i] .. binEdges[i + 1].</param>
    /// <returns>
    /// One count per bin (binEdges.Length - 1 entries); empty when fewer than
    /// two edges are given. Values outside all bins are not counted.
    /// </returns>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public static double[] Histogram(double[] data, double[] binEdges)
    {
        if (data == null)
        {
            throw new ArgumentNullException("data");
        }

        if (binEdges == null)
        {
            throw new ArgumentNullException("binEdges");
        }

        if (binEdges.Length < 2)
        {
            return new double[0];
        }

        double[] counts = new double[binEdges.Length - 1];
        int lastBin = counts.Length - 1;

        foreach (double value in data)
        {
            for (int bin = 0; bin <= lastBin; bin++)
            {
                double lower = binEdges[bin], upper = binEdges[bin + 1];

                // Only the last bin is closed on the right; otherwise a value that
                // sits on an interior edge would be counted in two adjacent bins.
                bool belowUpper = value < upper || (bin == lastBin && value <= upper);

                if (value >= lower && belowUpper)
                {
                    counts[bin]++;
                    break;
                }
            }
        }

        return counts;
    }

    /// <summary>
    /// Returns <paramref name="count"/> evenly spaced values from
    /// <paramref name="a"/> to <paramref name="b"/>, both inclusive.
    /// </summary>
    /// <param name="a">First value.</param>
    /// <param name="b">Last value.</param>
    /// <param name="count">Number of values to produce.</param>
    /// <returns>
    /// The spaced values; empty for a count of 0 and just <paramref name="a"/>
    /// for a count of 1.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
    public static double[] LinearSpace(double a, double b, int count)
    {
        if (count < 0)
        {
            throw new ArgumentOutOfRangeException("count", "count must not be negative.");
        }

        double[] output = new double[count];

        if (count == 0)
        {
            return output;
        }

        output[0] = a;

        if (count == 1)
        {
            // A single point has no step; avoids the 0 / 0 of the general formula.
            return output;
        }

        int last = count - 1;
        double span = b - a;

        for (int i = 1; i < last; i++)
        {
            output[i] = a + ((i * span) / last);
        }

        // Pin the end point so rounding can never leave it short of (or past) b.
        output[last] = b;

        return output;
    }
}

像這樣運行:

// Sample observations used to exercise the bin-width calculation.
double[] samples = new double[]
{
    4.37, 3.87, 4.00, 4.03, 3.50, 4.08, 2.25, 4.70, 1.73, 4.93, 1.73, 4.62, 3.43, 4.25, 1.68, 3.92, 3.68, 3.10,
    4.03, 1.77, 4.08, 1.75, 3.20, 1.85, 4.62, 1.97, 4.50, 3.92, 4.35, 2.33, 3.83, 1.88, 4.60, 1.80, 4.73, 1.77,
    4.57, 1.85, 3.52, 4.00, 3.70, 3.72, 4.25, 3.58, 3.80, 3.77, 3.75, 2.50, 4.50, 4.10, 3.70, 3.80, 3.43, 4.00,
    2.27, 4.40, 4.05, 4.25, 3.33, 2.00, 4.33, 2.93, 4.58, 1.90, 3.58, 3.73, 3.73, 1.82, 4.63, 3.50, 4.00, 3.67,
    1.67, 4.60, 1.67, 4.00, 1.80, 4.42, 1.90, 4.63, 2.93, 3.50, 1.97, 4.28, 1.83, 4.13, 1.83, 4.65, 4.20, 3.93,
    4.33, 1.83, 4.53, 2.03, 4.18, 4.43, 4.07, 4.13, 3.95, 4.10, 2.27, 4.58, 1.90, 4.50, 1.95, 4.83, 4.12
};

// Hand the samples to the bin-width calculation.
HistSample.CalculateOptimalBinWidth(samples);
1

請檢查 Histogram 函數。如果有數據元素恰好落在箱的邊界上(第一個和最後一個邊界除外),它們會在相鄰的兩個箱中各被計數一次。代碼應改爲檢查 (lower <= data[j] && data[j] < upper),並單獨處理等於 xMax 的元素,使其計入最後一個箱。

0

我會推薦使用二分搜索,以加快將元素分配到各個區間(箱)的速度。

/// <summary>
/// Adds one observation to the histogram by binary-searching <c>Bins</c> for
/// the bin whose [LeftBound, RightBound) interval contains it and incrementing
/// that bin's Count.
/// </summary>
/// <param name="element">The value to add.</param>
public void Add(double element)
{
    // Values left of the first bin or right of the last bin are ignored.
    if (element < Bins.First().LeftBound)
    {
        return;
    }

    if (element > Bins.Last().RightBound)
    {
        return;
    }

    int low = 0;
    int high = Bins.Length - 1;
    int probe = 0;

    while (low <= high)
    {
        probe = low + ((high - low) / 2);

        if (element >= Bins[probe].LeftBound)
        {
            if (element < Bins[probe].RightBound)
            {
                // Found the containing bin.
                break;
            }

            // At or beyond this bin's right bound: continue in the upper half.
            low = probe + 1;
        }
        else if (element < Bins[probe].LeftBound)
        {
            high = probe - 1;
        }
        else
        {
            // Neither >= nor < held (unordered comparison, e.g. NaN): move right.
            low = probe + 1;
        }
    }

    // If the search ran out without a match, the last probed bin receives the
    // count; this is how a value equal to the last RightBound is recorded.
    Bins[probe].Count++;
}

「Bins」是由「HistogramItem」項目組成的列表,該類型定義了「LeftBound」、「RightBound」和「Count」等屬性。

0

對nick_w回答的小更新。

適用於之後確實還需要用到各個箱(bins)的情況。此外還優化了 Histogram 函數中的雙重循環,並去掉了 linspace(LinearSpace)函數。

 /// <summary>
    /// Picks a histogram bin width for <paramref name="x"/> by minimising the
    /// Shimazaki-Shinomoto cost C = (2k - v) / d^2 over the candidate bin counts
    /// 5, 10, ..., 95, and returns the histogram built with the chosen width.
    /// </summary>
    /// <param name="x">The samples to bin.</param>
    /// <param name="xMin">Receives the smallest sample, i.e. the left edge of the first bin.</param>
    /// <param name="optimalBinWidth">
    /// Receives the selected bin width, or 0 when the samples have no spread.
    /// </param>
    /// <returns>
    /// The per-bin counts for the selected width; a single bin holding every
    /// sample when the samples have no spread.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
    /// <exception cref="InvalidOperationException"><paramref name="x"/> is empty.</exception>
    public static int[] CalculateOptimalBinWidth(List<double> x, out double xMin, out double optimalBinWidth)
    {
        if (x == null)
        {
            throw new ArgumentNullException("x");
        }

        // Candidate bin counts: multiples of BinCountStep from 5 up to 95.
        const int BinCountStep = 5;
        const int MinBinCount = 5;
        const int MaxBinCount = 95;

        // Min/Max throw InvalidOperationException for an empty list.
        xMin = x.Min();
        var xMax = x.Max();
        optimalBinWidth = 0;

        var range = xMax - xMin;

        // Without spread every width would be zero and bin indices undefined,
        // so report one bin that contains all samples.
        if (!(range > 0))
        {
            return new[] { x.Count };
        }

        int[] bestCounts = null;
        var bestCost = double.MaxValue;

        for (var n = MinBinCount; n <= MaxBinCount; n += BinCountStep)
        {
            var d = range / n;
            var ki = Histogram(x, n, xMin, d);

            // The cost is defined over all n bins, so none of them is dropped here.
            var mean = ki.Average();
            var variance = ki.Sum(k => (k - mean) * (k - mean)) / n;
            var cost = (2 * mean - variance) / (d * d);

            // The null check guarantees a result even if no cost compares lower.
            if (bestCounts == null || cost < bestCost)
            {
                bestCounts = ki;
                bestCost = cost;
                optimalBinWidth = d;
            }
        }

        return bestCounts;
    }

    /// <summary>
    /// Counts the values of <paramref name="data"/> in <paramref name="count"/>
    /// equal-width bins that start at <paramref name="xMin"/>.
    /// </summary>
    /// <param name="data">The values to count.</param>
    /// <param name="count">Number of bins.</param>
    /// <param name="xMin">Left edge of the first bin.</param>
    /// <param name="d">Width of each bin.</param>
    /// <returns>The number of values per bin.</returns>
    private static int[] Histogram(List<double> data, int count, double xMin, double d)
    {
        int[] binCounts = new int[count];

        for (int i = 0; i < data.Count; i++)
        {
            // Bin index is the whole number of widths between xMin and the value.
            int slot = (int)Math.Truncate((data[i] - xMin) / d);

            // A value exactly count widths from xMin (the maximum) belongs to the last bin.
            int target = slot == count ? slot - 1 : slot;

            binCounts[target]++;
        }

        return binCounts;
    }