2016-08-18 67 views
-2

我有大量目錄類型的數據,包含電子郵件,電話,郵編等屬性。我想根據多個屬性檢測重複項並對其分組:一個組可以由在不同屬性上重複的記錄構成,組內的記錄不必兩兩直接重複,也可以通過另一行間接相關。複雜分組

示例數據:

Name  |  Email  |  Tel  |  Postcode 
John Sim |  [email protected] |  111  |  C67 
J Sim  |    |  111  | 
John S  |    |    |  C67 

我希望能夠找到一種方法,檢測電子郵件,電話和郵編屬性上的重複,即使這些屬性並非全部匹配。所以在上面的例子中,我會把上述全部3條記錄分到同一組:儘管最後一條與中間一條沒有直接匹配,但第一條與這兩條都匹配。

希望這樣說得通!顯然,這是一個非常簡化的例子,實際上我有上百條記錄,我想把它們分組,以便顯示各個重複組。

到目前爲止,我只找到了一些非常低效的方法:檢查每一行,把所有直接重複的記錄分到一組,然後檢查這些重複項是否還有其他重複項,如果有,就把它們全部移到一個新組裏。但我想尋找一些靈感和更有效的做法 :)

感謝!

+0

是什麼讓這3條記錄重複,我只看到2條是合格的? – Aybe

+0

@Aybe這聽起來像因爲first.Tel == second.Tel和first.Postcode == third.Postcode。 – itsme86

+0

如果最後一條記錄的電話號碼是999,那該怎麼辦?仍然會被認爲是重複的嗎? –

回答

0

你可以使用索引,並在foreach循環的每次迭代中合併分組,像下面這樣把複雜度降至O(N):

 // For every entry, look up the group already indexed under each of its
 // non-null attributes and try to add the entry to that group. A group that
 // rejects the entry (Add returns false) counts as "no match" for that attribute.
 foreach (var entry in list)
 {
     Group emailGroup = null;
     Group telGroup = null;
     Group postcodeGroup = null;

     if (entry.Email != null && _emailGroups.TryGetValue(entry.Email, out emailGroup))
     {
         if (!emailGroup.Add(entry)) emailGroup = null;
     }

     if (entry.Tel != null && _telGroups.TryGetValue(entry.Tel, out telGroup))
     {
         if (!telGroup.Add(entry)) telGroup = null;
     }

     // The property on Entry is spelled PostCode (capital C).
     if (entry.PostCode != null && _postcodeGroups.TryGetValue(entry.PostCode, out postcodeGroup))
     {
         if (!postcodeGroup.Add(entry)) postcodeGroup = null;
     }

     // No existing group accepted the entry: start a new group for it.
     if (emailGroup == null && telGroup == null && postcodeGroup == null)
     {
         CreateGroup(entry);
         continue;
     }

     // The entry may now belong to up to three groups; merge the compatible ones.
     CombineGroups(emailGroup, telGroup, postcodeGroup);
 }

當然,你必須自行決定並處理你想要的行爲。如果需要,還可以加入姓名相關的邏輯(例如拆分成名+中間名+姓),然後對每一部分做雙向的包含(Contains)比較(這相當昂貴,所以可能需要考慮字符串索引)

完整的代碼+測試

見方法

[Test] 
public void Test() 

下面。

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using FluentAssertions; 
using NUnit.Framework; 

namespace StackOverflow 
{ 
    [TestFixture] 
    public class Class1 
    { 
     // Index: email value -> the group registered under that email.
     private Dictionary<string, Group> _emailGroups; 
     // Index: telephone value -> the group registered under that telephone.
     private Dictionary<string, Group> _telGroups; 
     // Index: postcode value -> the group registered under that postcode.
     private Dictionary<string, Group> _postcodeGroups; 

     /// <summary>
     /// Starts a new group seeded with <paramref name="entry"/> and registers it in
     /// every attribute index whose key is not already taken by another group.
     /// </summary>
     /// <param name="entry">The entry that becomes the first member of the new group.</param>
     private void CreateGroup(Entry entry)
     {
         var group = new Group(entry);

         // Each attribute is checked against its OWN index. Checking Tel/PostCode
         // against the email index (as the code previously did) made the guard
         // almost always pass, silently overwriting an existing tel/postcode entry.
         // NOTE(review): if every non-null key is already taken, the new group ends
         // up in no index at all - confirm whether that case needs handling.
         if (group.Email != null && !_emailGroups.ContainsKey(group.Email))
         {
             _emailGroups[group.Email] = group;
         }

         if (group.Tel != null && !_telGroups.ContainsKey(group.Tel))
         {
             _telGroups[group.Tel] = group;
         }

         if (group.PostCode != null && !_postcodeGroups.ContainsKey(group.PostCode))
         {
             _postcodeGroups[group.PostCode] = group;
         }
     }

     /// <summary>
     /// Merges the groups an entry was matched to, pair by pair: tel into email,
     /// postcode into email, then postcode into tel. A group that has been merged
     /// away is not used again as a merge partner.
     /// </summary>
     /// <param name="emailGroup">Group matched through the email index, or null.</param>
     /// <param name="telGroup">Group matched through the telephone index, or null.</param>
     /// <param name="postcodeGroup">Group matched through the postcode index, or null.</param>
     private void CombineGroups(Group emailGroup, Group telGroup, Group postcodeGroup)
     {
         if (MergeInto(emailGroup, telGroup))
         {
             telGroup = null;
         }

         if (MergeInto(emailGroup, postcodeGroup))
         {
             postcodeGroup = null;
         }

         MergeInto(telGroup, postcodeGroup);
     }

     /// <summary>
     /// Folds <paramref name="source"/> into <paramref name="target"/> and repoints the
     /// indexes, provided both exist, are different instances and can be combined.
     /// </summary>
     /// <returns>true when the merge happened; otherwise false.</returns>
     private bool MergeInto(Group target, Group source)
     {
         if (target == source || target == null || source == null)
         {
             return false;
         }

         if (!target.CanCombine(source))
         {
             return false;
         }

         target.Add(source);
         UpdateIndexes(target, source);
         return true;
     }

     /// <summary>
     /// After <paramref name="oldGroup"/> has been merged into <paramref name="newGroup"/>,
     /// repoints each index entry that still refers to the old group at the new one.
     /// </summary>
     /// <param name="newGroup">The group that absorbed the other one.</param>
     /// <param name="oldGroup">The group that was merged away.</param>
     private void UpdateIndexes(Group newGroup, Group oldGroup)
     {
         Repoint(_emailGroups, oldGroup.Email, oldGroup, newGroup);
         Repoint(_telGroups, oldGroup.Tel, oldGroup, newGroup);
         Repoint(_postcodeGroups, oldGroup.PostCode, oldGroup, newGroup);
     }

     /// <summary>
     /// Replaces the value stored under <paramref name="key"/> with <paramref name="to"/>,
     /// but only when the key is non-null and currently maps to <paramref name="from"/>.
     /// </summary>
     private static void Repoint(Dictionary<string, Group> index, string key, Group from, Group to)
     {
         if (key == null)
         {
             return;
         }

         Group current;
         if (index.TryGetValue(key, out current) && ReferenceEquals(current, from))
         {
             index[key] = to;
         }
     }

     /// <summary>
     /// A set of entries that share one key (Email / Tel / PostCode). A key part
     /// starts out as whatever the first entry carried and, while it is still null,
     /// is filled in by the first added entry that has a value for it.
     /// </summary>
     public class Group
     {
         /// <summary>The entries that belong to this group.</summary>
         public HashSet<Entry> Entries = new HashSet<Entry>();

         /// <summary>Creates a group whose key and only member come from <paramref name="entry"/>.</summary>
         public Group(Entry entry)
         {
             Email = entry.Email;
             Tel = entry.Tel;
             PostCode = entry.PostCode;
             Entries.Add(entry);
         }

         public string Email { get; set; }
         public string Tel { get; set; }
         public string PostCode { get; set; }

         /// <summary>
         /// Returns true unless some attribute is set on both this group and
         /// <paramref name="entry"/> with different values.
         /// </summary>
         public bool Matches(Entry entry)
         {
             return (Email == null || entry.Email == null || entry.Email == Email)
                 && (Tel == null || entry.Tel == null || entry.Tel == Tel)
                 && (PostCode == null || entry.PostCode == null || entry.PostCode == PostCode);
         }

         /// <summary>
         /// Adds <paramref name="entry"/> when it matches this group, filling in any
         /// key part that is still null from the entry.
         /// </summary>
         /// <returns>false when the entry does not match and was not added; otherwise true.</returns>
         public bool Add(Entry entry)
         {
             if (Matches(entry))
             {
                 Entries.Add(entry);

                 // Only key parts that are still unset are taken from the entry.
                 Email = Email ?? entry.Email;
                 Tel = Tel ?? entry.Tel;
                 PostCode = PostCode ?? entry.PostCode;

                 return true;
             }

             return false;
         }

         /// <summary>
         /// Returns true unless some attribute is set on both groups with different values.
         /// </summary>
         public bool CanCombine(Group entry)
         {
             return (Email == null || entry.Email == null || Email == entry.Email)
                 && (Tel == null || entry.Tel == null || Tel == entry.Tel)
                 && (PostCode == null || entry.PostCode == null || PostCode == entry.PostCode);
         }

         /// <summary>Adds every entry of <paramref name="group"/> to this group via <see cref="Add(Entry)"/>.</summary>
         public void Add(Group group)
         {
             foreach (var member in group.Entries)
             {
                 Add(member);
             }
         }

         /// <summary>Renders the key on the first line followed by one line per entry.</summary>
         public override string ToString()
         {
             var text = new StringBuilder();
             text.AppendLine($"Key: {Email ?? "null"} | {Tel ?? "null"} | {PostCode ?? "null"}");

             foreach (var member in Entries)
             {
                 text.AppendLine(member.ToString());
             }

             return text.ToString();
         }
     }

     /// <summary>
     /// One record to be grouped: a name plus three optional attributes.
     /// Null is printed as the text "null" by <see cref="ToString"/>.
     /// </summary>
     public class Entry
     {
         public string Name { get; set; }
         public string Email { get; set; }
         public string Tel { get; set; }
         public string PostCode { get; set; }

         /// <summary>Creates an entry from the four given values; any of them may be null.</summary>
         public Entry(string name, string email, string tel, string postCode)
         {
             Name = name;
             Email = email;
             Tel = tel;
             PostCode = postCode;
         }

         /// <summary>Renders the entry as a single pipe-separated line.</summary>
         public override string ToString()
             => $"Entry: {Name ?? "null"} | {Email ?? "null"} | {Tel ?? "null"} | {PostCode ?? "null"}";
     }

     /// <summary>
     /// Feeds three entries that are only linked transitively (the third shares a
     /// telephone with the second and a postcode with the first) through the grouping
     /// loop and expects a single group holding all three.
     /// </summary>
     [Test]
     public void Test()
     {
         _emailGroups = new Dictionary<string, Group>();
         _telGroups = new Dictionary<string, Group>();
         _postcodeGroups = new Dictionary<string, Group>();

         var list = new List<Entry>
         {
             new Entry("John S", null, null, "C67"),
             new Entry("J Sim", null, "111", null),
             new Entry("John Sim", "[email protected]", "111", "C67")
         };

         foreach (var entry in list)
         {
             Group byEmail = null;
             Group byTel = null;
             Group byPostcode = null;

             // A lookup hit only counts when the group also accepts the entry.
             // NOTE(review): when Add fills a previously-null key on an existing group,
             // that key is not registered in its index here (in this data the third
             // entry's email never reaches _emailGroups) - confirm whether intended.
             if (entry.Email != null
                 && _emailGroups.TryGetValue(entry.Email, out byEmail)
                 && !byEmail.Add(entry))
             {
                 byEmail = null;
             }

             if (entry.Tel != null
                 && _telGroups.TryGetValue(entry.Tel, out byTel)
                 && !byTel.Add(entry))
             {
                 byTel = null;
             }

             if (entry.PostCode != null
                 && _postcodeGroups.TryGetValue(entry.PostCode, out byPostcode)
                 && !byPostcode.Add(entry))
             {
                 byPostcode = null;
             }

             var joinedExisting = byEmail != null || byTel != null || byPostcode != null;
             if (joinedExisting)
             {
                 CombineGroups(byEmail, byTel, byPostcode);
             }
             else
             {
                 CreateGroup(entry);
             }
         }

         // Collect every group reachable from any index, each one once.
         var groups = _emailGroups.Values
             .Concat(_telGroups.Values)
             .Concat(_postcodeGroups.Values)
             .Distinct()
             .ToList();

         groups.ForEach(g => Console.WriteLine(g.ToString()));

         groups.Should().HaveCount(1);

         groups[0].Entries.Should().HaveCount(3);
     }
    } 
}