2017-04-26 58 views
0

我有以下的代碼,通過數組列表循環(mainItems),並找到最相似的兩個數組,並把它們放在sortedTransactions。對於小數據(10000個事務)它工作正常,但它對於88000個事務永遠運行。可以做些什麼來使其適用於大數據。計算Jaccard相似Java中

import java.util.*; 

    public class Sort { 

    static private List<Transactions> trans = ReadFile.transactions; 
    static public List<int[]> mainItems; 
    static public ArrayList<int[]> sortedTransactions = new ArrayList<int[]>(); 

    static { 
     mainItems = new ArrayList<int[]>(); 

     for (Transactions t : trans) { 
      mainItems.add(t.getItems()); 
     } 
    } 

    static private double jaccardSimilarity(int[] a, int[] b) { 

     Set<Integer> s1 = new LinkedHashSet<Integer>(); 
     for(int i =0; i< a.length; i++){ 
      s1.add(a[i]); 
     } 
     Set<Integer> s2 = new LinkedHashSet<Integer>(); 
     for(int i =0; i< b.length; i++){ 
      s2.add(b[i]); 
     } 

     Set<Integer> intersection = new LinkedHashSet<>(s1); 
     intersection.retainAll(s2); 

     Set<Integer> union = new LinkedHashSet<Integer>(s1); 
     union.addAll(s2); 

     double jaccardSimilarity = (double)intersection.size()/ (double)union.size(); 
     //System.out.println(intersection); 
     return jaccardSimilarity; 
    } 

    static private boolean isAllEqual(List<Double> a){ 

     for(int i=1; i<a.size(); i++){ 
      if(a.get(0) != a.get(i)){ 
       return false; 
      } 
     } 

     return true; 
    } 


    static public void generatePairs() { 

     for (int i = 0; i < mainItems.size() - 1; i++) { 

      if (!sortedTransactions.contains(mainItems.get(i))) { 

       List<Double> myd = new ArrayList<Double>(); 
       List<int[]> mys = new ArrayList<int[]>(); 

       for (int j = i + 1; j < mainItems.size(); j++) { 

        if (!sortedTransactions.contains(mainItems.get(j))) { 

         myd.add(jaccardSimilarity(mainItems.get(i),mainItems.get(j))); 
         mys.add(mainItems.get(j)); 
        } 
       } 

       if (isAllEqual(myd) == false) { 

        sortedTransactions.add(mainItems.get(i)); 
        sortedTransactions.add(mys.get(maxValue(myd))); 
       } 
      } 
     } 
    } 

    static private int maxValue(List<Double> d) { 

     double max = d.get(0); 
     int f = 0; 

     for(int i =1; i< d.size(); i++){ 

      if(d.get(i) > max){ 

       max= d.get(i); 
       f= i; 
      } 
     } 
     return f; 
    } 
} 
+0

不運行這個地方就很難確認,但不是蠻力通過'maxValue'列表迭代,你可以嘗試'Collections'排序就可以把列表以數字順序從最大到最小,然後選擇第一個值。使用Java進行排序非常快速,並且進行了優化。不幸的是,改善具有很多循環的程序的性能往往需要對結構進行全面的反思。以下庫包含一個Jaccard相似性實現,可能有用 - https://github.com/tdebatty/java-string-similarity – Alex

回答

1

您不必創建並集(工會(S1,S2).size()是s1.size()+ s2.size() - 路口(S1,S2).size( ))。

static private double jaccardSimilarity(int[] a, int[] b) { 

    Set<Integer> s1 = new HashSet<Integer>(); 
    for (int i = 0; i < a.length; i++) { 
     s1.add(a[i]); 
    } 
    Set<Integer> s2 = new HashSet<Integer>(); 
    for (int i = 0; i < b.length; i++) { 
     s2.add(b[i]); 
    } 

    final int sa = s1.size(); 
    final int sb = s2.size(); 
    s1.retainAll(s2); 
    final int intersection = s1.size(); 
    return 1d/(sa + sb - intersection) * intersection; 
}