2012-10-12 147 views
8

選擇的行的不同組我有表t_stats用柱id (INT)和列ratio (DECIMAL(8,4))id是獨一無二的。根據平均

我想要查詢表t_stats以便選擇3個組具有相同AVG(ratio)(儘可能最接近)。

可以使用臨時表來完成,只要我可以運行它作爲一個腳本或存儲過程。


編輯:下面是具體的例子:

輸入:

id ratio 
-- ----- 
24 0.930000 
25 0.390000 
26 0.800000 
27 0.920000 
28 0.550000 
30 0.810000 
31 0.770000 
32 0.800000 
33 0.590000 
36 0.760000 
37 0.910000 
40 0.690000 
43 0.390000 
45 0.310000 
46 0.760000 
47 0.710000 
54 0.710000 
55 0.950000 
57 0.920000 
60 0.890000 
62 0.700000 
66 0.890000 
68 0.950000 
107 0.760000 
559 0.990000 
560 0.540000 
565 0.430000 
566 0.830000 
568 0.590000 
579 0.970000 
599 0.900000 
623 0.450000 
749 0.800000 
750 0.970000 
753 0.820000 
754 0.730000 
766 0.620000 
768 0.430000 
770 0.790000 
838 0.700000 
875 0.835000 
987 0.900000 
988 0.740000 
1157 0.850000 
1250 0.630000 
1328 0.860000 
2171 0.900000 
2176 0.520000 
2177 0.980000 
2178 0.940000 
2180 0.970000 
2184 0.990000 
2187 0.950000 
2188 0.940000 
2189 0.920000 
2195 0.990000 
2233 0.900000 
2234 0.940000 
2235 0.950000 
2240 0.980000 
2243 0.920000 
2253 0.900000 
2266 0.530000 
2269 0.920000 
2270 0.970000 
2271 0.750000 
2272 0.820000 
2275 0.910000 
2277 0.930000 
2281 0.690000 
2282 0.710000 
2288 0.840000 
2528 0.870000 
2778 0.950000 
2814 0.990000 

OUTPUT:

groupId id  ratio 
------- --  ----- 
1  24  0.930000 
1  25  0.390000 
1  27  0.920000 
1  30  0.810000 
1  32  0.800000 
1  36  0.760000 
1  54  0.710000 
1  60  0.890000 
1  559  0.990000 
1  560  0.540000 
1  566  0.830000 
1  568  0.590000 
1  623  0.450000 
1  750  0.970000 
1  838  0.700000 
1  987  0.900000 
1  1157  0.850000 
1  2178  0.940000 
1  2180  0.970000 
1  2253  0.900000 
1  2269  0.920000 
1  2271  0.750000 
1  2281  0.690000 
1  2778  0.950000 
1  2814  0.990000 
2  26  0.800000 
2  28  0.550000 
2  31  0.770000 
2  40  0.690000 
2  45  0.310000 
2  55  0.950000 
2  57  0.920000 
2  66  0.890000 
2  107  0.760000 
2  565  0.430000 
2  579  0.970000 
2  753  0.820000 
2  754  0.730000 
2  766  0.620000 
2  875  0.835000 
2  1328  0.860000 
2  2176  0.520000 
2  2177  0.980000 
2  2184  0.990000 
2  2187  0.950000 
2  2189  0.920000 
2  2233  0.900000 
2  2234  0.940000 
2  2275  0.910000 
2  2282  0.710000 
3  33  0.590000 
3  37  0.910000 
3  43  0.390000 
3  46  0.760000 
3  47  0.710000 
3  62  0.700000 
3  68  0.950000 
3  599  0.900000 
3  749  0.800000 
3  768  0.430000 
3  770  0.790000 
3  988  0.740000 
3  1250  0.630000 
3  2171  0.900000 
3  2188  0.940000 
3  2195  0.990000 
3  2235  0.950000 
3  2240  0.980000 
3  2243  0.920000 
3  2266  0.530000 
3  2270  0.970000 
3  2272  0.820000 
3  2277  0.930000 
3  2288  0.840000 
3  2528  0.870000 

所以我想使3組n值和瞄準特定的平均值x。 (爲例與n=300.75 < x < 0.85會是什麼樣子3組30個值中的每個,其中每個組具有0.75 < AVG(ratio) < 0.85和一個id只能屬於1組。)

所以平均是每個組中的幾乎相同,並且靠近x

groupId  avg(ratio) 
-------  ---------- 
1   0.805600 
2   0.789000 
3   0.797600 
+0

這就像旅行推銷員問題。您需要比較所有組合並挑選出最佳表演者。即使是中等大小的數據集,它也是數量龐大的組合!除非你很樂意做一個天真的估計? * [按大小排序,第1-3行到第1-3組,然後第4-6行到第1-3組等等] * – MatBailie

+0

目前這是我使用的。但我希望有一個不那麼天真的解決方案。 –

+0

您需要查看優化算法。 SQL不是最好的環境。 – MatBailie

回答

3

這裏是一個T-SQL程序版本是有點像draft,只爲了草案每輪根據需要優化。

如果要挑選所有項目,這種「競爭」性質似乎會導致略低於完美的比率,但最重要的是您基本上擁有O(N^2)算法,因爲它本質上是min函數在一個循環中(考慮到group by子句,這可能是樂觀的)。這也是確定性的,如果有必要的話,在另一個層次上實現應該相當簡單。

-- SET THESE! 
declare @numberOfGroups int = 3 
declare @itemsPerGroup int = 25 
declare @targetRatio decimal(8,4) = .8 
-- /SET 

set nocount on 

-- Create a table of items 
declare @t_stats table (
     id int not null primary key 
    , ratio decimal(8,4) not null 
    , grp int null 
) 
insert into @t_stats (id, ratio) values 
    (24,0.930000), (25,0.390000), (26,0.800000), 
    (27,0.920000), (28,0.550000), (30,0.810000), 
    (31,0.770000), (32,0.800000), (33,0.590000), 
    (36,0.760000), (37,0.910000), (40,0.690000), 
    (43,0.390000), (45,0.310000), (46,0.760000), 
    (47,0.710000), (54,0.710000), (55,0.950000), 
    (57,0.920000), (60,0.890000), (62,0.700000), 
    (66,0.890000), (68,0.950000), (107,0.760000), 
    (559,0.990000), (560,0.540000), (565,0.430000), 
    (566,0.830000), (568,0.590000), (579,0.970000), 
    (599,0.900000), (623,0.450000), (749,0.800000), 
    (750,0.970000), (753,0.820000), (754,0.730000), 
    (766,0.620000), (768,0.430000), (770,0.790000), 
    (838,0.700000), (875,0.835000), (987,0.900000), 
    (988,0.740000), (1157,0.850000), (1250,0.630000), 
    (1328,0.860000), (2171,0.900000), (2176,0.520000), 
    (2177,0.980000), (2178,0.940000), (2180,0.970000), 
    (2184,0.990000), (2187,0.950000), (2188,0.940000), 
    (2189,0.920000), (2195,0.990000), (2233,0.900000), 
    (2234,0.940000), (2235,0.950000), (2240,0.980000), 
    (2243,0.920000), (2253,0.900000), (2266,0.530000), 
    (2269,0.920000), (2270,0.970000), (2271,0.750000), 
    (2272,0.820000), (2275,0.910000), (2277,0.930000), 
    (2281,0.690000), (2282,0.710000), (2288,0.840000), 
    (2528,0.870000), (2778,0.950000), (2814,0.990000) 

-- Create a table of groups 
declare @groups table (
    grp int not null primary key identity 
) 
while (select isnull(max(grp), 0) from @groups) < @numberOfGroups begin 
    insert into @groups default values 
end 

-- Check that we have enough items to fill all groups 
if @numberOfGroups * @itemsPerGroup <= (select count(*) from @t_stats) begin 

    -- Groups now pick the best-fitting items one at a time 
    while (select count(*) from @t_stats where grp is not null) < (select count(*) * @itemsPerGroup from @groups) begin 
     declare @grp int, @Num int, @ratio decimal(8,4), @id int 

     -- Find the group with the least number of items or the worst ratio 
     select top 1 @grp = grp, @Num = Num, @ratio = ratio 
     from (
      select g.grp 
       , count(i.grp) as Num 
       , isnull(avg(i.ratio), 0.0) as ratio 
       , abs(@targetRatio - avg(i.ratio)) as RatioDist 
      from @groups g 
       left join @t_stats i on g.grp = i.grp 
      group by g.grp 
     ) as a 
     order by Num, RatioDist, grp 

     -- Let that group make their best pick 
     select top 1 @id = id 
     from (
      select id 
       , abs(((ratio + (@ratio * @Num))/(@Num + 1)) - @targetRatio) as NewRatioDist 
      from @t_stats 
      where grp is null 
     ) as a 
     order by NewRatioDist 

     -- Update the items table based upon the pick 
     update @t_stats set grp = @grp where id = @id 

    end 

end 
else begin 
    -- Not enought items 
    raiserror('Too many groups or items per group.', 17, 0) 
end 

-- Display the results 
select grp, count(*) as Num, avg(ratio) as ratio 
from @t_stats 
group by grp 
order by grp 
+0

謝謝蒂姆...工程偉大,相對較快!良好的工作 –

+0

謝謝,儘管這裏有很多優化的空間。如果你想做的更好,請更新我們。 –

2

試試這個

Declare @t Table (Id Int, Ratio DECIMAL(8,2)) 
Insert Into @t Values(1,0.5),(2,0.55),(3,0.97),(4,0.77),(5,0.97),(6,0.99),(7,1.0),(8,0.15),(9,0.33) 

DECLARE @MeanSum DECIMAL(8,2) 
SELECT @MeanSum =SUM(Ratio)/3 FROM @T 

;WITH Cte (Id,Ratio,Ids,RatioValues,RatioTotalWeight,Level) AS 
( 
    SELECT Id 
      ,Ratio    
      , ',' + CAST(Id AS VARCHAR(MAX)) 
      ,',' + CAST(Ratio AS VARCHAR(MAX)) 
      ,CAST(Ratio AS DECIMAL(8,2)) 
      ,1 
    FROM @t 
    UNION ALL 
    SELECT   
      p.Id 
      , p.Ratio    
      ,c.Ids + ',' + CAST(p.Id AS VARCHAR(MAX)) 
      ,c.RatioValues + ',' + CAST(p.Ratio AS VARCHAR(MAX)) 
      ,CAST(c.RatioTotalWeight + p.Ratio AS DECIMAL(8,2)) 
      ,c.Level+1   
    FROM @t AS p JOIN Cte c ON p.Id < c.Id 
    WHERE c.Level < 3 
),CTEOf3Groups AS( 
    SELECT 
     Ids = STUFF(Ids,1,1,'') 
     ,RatioValues 
     ,RatioTotalWeight 
     , FirstChar = SUBSTRING(STUFF(Ids,1,1,''),0,CHARINDEX(',',STUFF(Ids,1,1,''))) 
     ,DENSE_RANK() OVER(ORDER BY ABS(RatioTotalWeight - @MeanSum)) [rank] -- gets the closest distance 
    FROM CTE  
),CteGetTheRanks AS( 
Select *, Rn = Row_Number() Over(Partition By FirstChar Order by FirstChar, [Rank]) 
From CTEOf3Groups) 
,CteGroups AS( 
SELECT [GroupId] = Row_Number() Over(Order By (Select 1)), Ids,[Rank] 
FROM CteGetTheRanks 
Where [Rank]<=3 
AND Rn = 1) 

SELECT X.[GroupId],X.Id,t.Ratio 
FROM 
    ( 
     SELECT F1.[GroupId], 
     O.splitdata AS ID 
     FROM 
      ( 
       SELECT *, 
       CAST('<X>'+REPLACE(F.Ids,',','</X><X>')+'</X>' AS XML) AS xmlfilter 
       FROM CteGroups F 
      )F1 
     CROSS APPLY 
     ( 
      SELECT fdata.D.value('.','varchar(50)') AS splitdata 
      FROM f1.xmlfilter.nodes('X') As fdata(D) 
     ) O 
    )X JOIN @t t ON t.Id = X.ID 
ORDER BY 1,2 
OPTION (MAXRECURSION 0) 

結果 enter image description here

編輯 我與您提供的sampel數據(DDL provied參考)

Declare @t Table (Id Int, Ratio DECIMAL(8,2)) 
Insert Into @t Values 
(52,0.930000),(53,0.390000),(54,0.800000),(55,0.920000),(56,0.550000), 
(58,0.810000),(59,0.770000),(60,0.800000),(61,0.590000),(64,0.760000), 
(65,0.910000),(68,0.690000),(71,0.390000),(73,0.310000),(74,0.760000), 
(75,0.710000),(82,0.710000),(83,0.950000),(85,0.920000),(88,0.890000), 
(90,0.700000),(94,0.890000),(96,0.950000),(135,0.760000),(587,0.990000), 
(588,0.540000),(593,0.430000),(594,0.830000),(596,0.590000),(607,0.970000), 
(627,0.900000),(651,0.450000),(777,0.800000),(778,0.970000),(781,0.820000), 
(782,0.730000),(794,0.620000),(796,0.430000),(798,0.790000),(866,0.700000), 
(903,0.835000),(1015,0.900000),(1016,0.740000),(1185,0.850000),(1278,0.630000), 
(1356,0.860000),(2199,0.900000),(2204,0.520000),(2205,0.980000),(2206,0.940000), 
(2208,0.970000),(2212,0.990000),(2215,0.950000),(2216,0.940000),(2217,0.920000), 
(2223,0.990000),(2261,0.900000),(2262,0.940000),(2263,0.950000),(2268,0.980000), 
(2271,0.920000),(2281,0.900000),(2294,0.530000),(2297,0.920000),(2298,0.970000), 
(2299,0.750000),(2300,0.820000),(2303,0.910000),(2305,0.930000),(2309,0.690000), 
(2310,0.710000),(2316,0.840000),(2556,0.870000),(2806,0.950000),(2842,0.990000), 
(2844,0.710000),(2977,0.730000),(2985,0.960000),(3008,0.710000),(3042,0.910000), 
(3061,0.830000),(3243,0.900000),(3346,0.800000),(3371,0.800000),(3497,0.990000), 
(3838,0.730000),(4000,0.980000),(4001,0.890000),(4002,0.850000),(4003,0.490000), 
(4004,0.970000),(4009,0.930000),(4032,0.930000),(4095,0.460000),(4428,0.610000), 
(4438,0.960000),(4439,0.930000),(4445,0.650000),(4446,0.660000),(4447,0.490000), 
(4455,0.880000),(4457,0.890000),(4460,0.980000),(4469,0.930000),(4473,0.980000), 
(4474,0.950000),(4475,0.940000),(4481,0.400000),(4489,0.760000),(4490,0.470000) 

而結果試圖爲

enter image description here

所執行的時間是27秒。請從您的結束(也是結果)測試,讓我知道。

編輯

75記錄DDL

Declare @t Table (Id Int, Ratio DECIMAL(8,4)) 
Insert Into @t Values 
(24,0.930000),(25,0.390000),(26,0.800000),(27,0.920000), 
(28,0.550000),(30,0.810000),(31,0.770000),(32,0.800000), 
(33,0.590000),(36,0.760000),(37,0.910000),(40,0.690000), 
(43,0.390000),(45,0.310000),(46,0.760000),(47,0.710000), 
(54,0.710000),(55,0.950000),(57,0.920000),(60,0.890000), 
(62,0.700000),(66,0.890000),(68,0.950000),(107,0.760000), 
(559,0.990000),(560,0.540000),(565,0.430000),(566,0.830000), 
(568,0.590000),(579,0.970000),(599,0.900000),(623,0.450000), 
(749,0.800000),(750,0.970000),(753,0.820000),(754,0.730000), 
(766,0.620000),(768,0.430000),(770,0.790000),(838,0.700000), 
(875,0.835000),(987,0.900000),(988,0.740000),(1157,0.850000), 
(1250,0.630000),(1328,0.860000),(2171,0.900000),(2176,0.520000), 
(2177,0.980000),(2178,0.940000),(2180,0.970000),(2184,0.990000), 
(2187,0.950000),(2188,0.940000),(2189,0.920000),(2195,0.990000), 
(2233,0.900000),(2234,0.940000),(2235,0.950000),(2240,0.980000), 
(2243,0.920000),(2253,0.900000),(2266,0.530000),(2269,0.920000), 
(2270,0.970000),(2271,0.750000),(2272,0.820000),(2275,0.910000), 
(2277,0.930000),(2281,0.690000),(2282,0.710000),(2288,0.840000), 
(2528,0.870000),(2778,0.950000),(2814,0.990000) 
+0

我無法用我的數據測試您的解決方案(110行而不是9,WITH子句的行數太多......)。 '聲明終止。在聲明完成之前,最大遞歸100已經用完了。' –

+0

我很想嘗試它並查看結果,但是使用75行表已經運行了20分鐘,並且仍在運行。也許有人會優化你的解決方案? –

+0

能否請你給我的數據與尊重output.I將優化我的解決方案,並會回來給你.1技巧雖然...而是將結果存儲在一些中間臨時表創建索引,並嘗試了。 –

1

SQL真的不適合這類問題的最佳工具。

但是,有時使用TSQL錘擊一些螺絲是有趣的!

下面是沾到75行示例數據如下努力:

GroupId  Average         Count 
    ----------- --------------------------------------- ----------- 
    1   0.798400        25 
    2   0.796600        25 
    3   0.797200        25 

在我的機器上的第二下。
只是一個警告:這種方法有很大的缺陷,但如果你需要在SQL中做到這一點,你可能會向他們發送一些磁帶,但我沒有時間。

-- **Expects data in table t_stats (id, ratio)** 
if OBJECT_ID('tempdb..#data') is not null drop table #data 
if OBJECT_ID('tempdb..#pairsets') is not null drop table #pairsets 
if OBJECT_ID('tempdb..#pairseed') is not null drop table #pairseed 
if OBJECT_ID('tempdb..#match') is not null drop proC#match 
go 

-- rather horrible routine using dsql to find either: 
-- 1) groups of values that sum to exactly @targetsum (only if @targetsum non null) 
-- 2) the group containing the least values that includes data id @includeid and where the sum is within +- @targetsumrange 
create proC#match(@targetsum DECIMAL(8,4), @includeid int, @targetsumrange DECIMAL(8,4)) as 
begin 
    set nocount on 
    declare @nearestmatch bit = 0 
    if @targetsum is null set @nearestmatch = 1 
    declare @combination table (value int, asstring varchar(10), alias varchar(50)) 
    declare @savedpairseed int = (select pairseed from #pairseed) 

    declare @stmtTemplate varchar(max) = 'declare @pairseed int = (select pairseed from #pairseed) 
    declare @DistSum DECIMAL(8,4) 
    <DeclareVars> 
    declare candloop cursor for select <SelectList>, <DistanceSum> as Dist_sum from <TableList> where <IdCheck> <SumCheck> 
    open candloop 
    fetch next from candloop into <VarsList>, @DistSum 
    while @@fetch_status = 0 
    begin 
    if (select count(*) from #data where id in (<VarsList>)) = <VarsCount> 
    begin 
     <DeleteData> 
     <InsertPairs> 
     set @pairseed = @pairseed + 1 
    end 
    fetch next from candloop into <VarsList>, @DistSum 
    end 
    close candloop 
    deallocate candloop 
    update #pairseed set pairseed = @pairseed ' 

    declare @combinations int = 1 
    declare @maxcombinations int = 8 
    while @combinations <= @maxcombinations 
    begin 
    insert @combination select @combinations, cast(@combinations as varchar(10)), char(ascii('a') + @combinations-1) 
    declare @DeclareVars varchar(max) = '' 
    declare @SelectList varchar(max) = '' 
    declare @TableList varchar(max) = '' 
    declare @IdCheck varchar(max) = '' 
    declare @DistanceSum varchar(max) = '' 
    declare @InsertPairs varchar(max) = '' 
    declare @VarsList varchar(max) = '' 
    declare @SumCheck varchar(max) = '' 
    declare @DeleteData varchar(max) = 'delete #data where id in (<VarsList>)' 

    select @DeclareVars = @DeclareVars + 'declare @id'+asstring+ ' int ' from @combination 
    select @SelectList = @SelectList + alias +'.id, ' from @combination 
    set @SelectList = SUBSTRING(@selectlist, 1, LEN(@SelectList)-1) 
    select @TableList = @TableList + '#data '+alias+', ' from @combination 
    set @TableList = SUBSTRING(@TableList, 1, LEN(@TableList)-1) 
    select @IdCheck = @IdCheck + a.alias+'.id < '+b.alias+'.id and ' 
    from @combination a join @combination b on a.value+1 = b.value 
    if LEN(@IdCheck) > 4 
     set @IdCheck = SUBSTRING(@IdCheck, 1, LEN(@IdCheck)-4) + ' and ' 
    select @DistanceSum = @DistanceSum + alias+'.targetdistance + ' from @combination 
    set @DistanceSum = SUBSTRING(@DistanceSum, 1, LEN(@DistanceSum)-2) 
    select @VarsList = @VarsList + '@id'+asstring+ ', ' from @combination 
    set @VarsList = SUBSTRING(@VarsList, 1, LEN(@VarsList)-1) 
    select @InsertPairs = @InsertPairs + 'insert #pairsets select @pairseed, @id'+asstring+ ', @DistSum'+ CHAR(10) from @combination 
    set @SumCheck = @DistanceSum + ' = '+ cast(@Targetsum as varchar(20)) 

    if @nearestmatch = 1 
    begin 
     set @SumCheck = '(' 
     select @SumCheck = @SumCheck + alias+'.id = '+CAST(@includeid as varchar(10))+' or ' from @combination 
     if LEN(@SumCheck) > 4 
     set @SumCheck = SUBSTRING(@SumCheck, 1, LEN(@SumCheck)-3) 
     set @SumCheck = @SumCheck + ')' 
     set @DeleteData = '' 
    end 

    declare @stmt varchar(max) 
    set @stmt = REPLACE(@stmtTemplate, '<DeclareVars>', @DeclareVars) 
    set @stmt = REPLACE(@stmt, '<DeleteData>', @DeleteData) 
    set @stmt = REPLACE(@stmt, '<SelectList>', @SelectList) 
    set @stmt = REPLACE(@stmt, '<TableList>', @TableList) 
    set @stmt = REPLACE(@stmt, '<IdCheck>', @IdCheck) 
    set @stmt = REPLACE(@stmt, '<DistanceSum>', @DistanceSum) 
    set @stmt = REPLACE(@stmt, '<InsertPairs>', @InsertPairs) 
    set @stmt = REPLACE(@stmt, '<VarsList>', @VarsList) 
    set @stmt = REPLACE(@stmt, '<VarsCount>', cast(@combinations as varchar(10))) 
    set @stmt = REPLACE(@stmt, '<SumCheck>', @SumCheck)  
    exec (@stmt)  
    set @combinations = @combinations + 1 
    end 

    if @nearestmatch = 1 
    begin 
    -- above will have recorded all possible matches within range 
    -- remove all but the closest and reindex the pair ids 
    declare @bestmatch int 
    select top 1 @bestmatch = pairid from #pairsets where pairid >= @savedpairseed and ABS(distsum) < @targetsumrange 
    delete #pairsets where pairid >= @savedpairseed and pairid <> ISNULL(@bestmatch, -1) 
    delete #data where id in (select id from #pairsets where pairid = @bestmatch) 
    update #pairsets set pairid = @savedpairseed where pairid = @bestmatch 
    update #pairseed set pairseed = @savedpairseed+1 
    end 

end 

go 
set nocount on 
-- set the parameters 
declare @xmin DECIMAL(8,4) = 0.75 
declare @xmax DECIMAL(8,4) = 0.85 
declare @xrange DECIMAL(8,4) = @xmax - @xmin 
declare @xtarg DECIMAL(8,4) = (@[email protected])/2 
declare @ngroups int = 3 
declare @targetgroupsize int = 25 

declare @maxbalancedpair int 

-- copy the ratio data (using 75 row data from updated question) 
select *, [email protected] as targetdistance, abs(ratio - @xtarg) as targetdistanceabsolute into #data from t_stats 
create table #pairseed (pairseed int) 
create table #pairsets (pairid int, id int, distsum DECIMAL(8,4)) 
insert #pairseed select 1 

-- due to the 2 decimal points and distribution of the data we can find many n-tuples that sum to zero 
exeC#match 0, 0, 0 

select @maxbalancedpair = pairseed-1 from #pairseed 

declare @deviants table (id int) 
declare @most_deviant int 
while exists(select * from #data where id not in (select id from @deviants)) 
begin 
    select top 1 @most_deviant = id from #data where id not in (select id from @deviants) order by targetdistanceabsolute desc 
    insert @deviants select @most_deviant 
    exeC#match null, @most_deviant, @xrange 
end 

-- in general there would have to be some backtracking here 
-- now its a box-packing problem, but for simplicity just assign them round robin 
declare @output_group_pairs table (groupid int, pairid int) 
declare @groupidx int = 1 
declare @numgroups int = 3 
declare @pairid int 
select @pairid = pairseed-1 from #pairseed 

while @pairid >= 0 
begin 
    insert @output_group_pairs select @groupidx, @pairid 
    set @pairid = @pairid - 1 
    set @groupidx = (@groupidx % @numgroups) + 1 
end 

-- wimpy effort at redistributing the groups evenly 
-- todo: many cases will not work, should use a proper algorithm 

declare @maxiter int = 100 
declare @previouspairs table (pairid int) 
declare @previousgroups table (groupid int) 
while exists(select groupid from @output_group_pairs a join #pairsets b on a.pairid = b.pairid group by groupid having COUNT(id) < @targetgroupsize) 
begin 
    set @maxiter = @maxiter-1 
    if @maxiter = 0 break 
    declare @groupid int = -1 
    declare @amountout int 
    select @groupid = groupid, @amountout = @targetgroupsize-COUNT(*) 
    from @output_group_pairs a join #pairsets b on a.pairid = b.pairid 
    where groupid not in (select groupid from @previousgroups) 
    group by groupid having COUNT(*) < @targetgroupsize 
    if @groupid = -1 break 

    declare @targetpair int = -1 

    select @targetpair = a.pairid from @output_group_pairs a 
    join (select pairid from #pairsets group by pairid having COUNT(*) <= @amountout) b on a.pairid = b.pairid 
    join (select groupid, count(id) groupcount from @output_group_pairs a join #pairsets b on a.pairid = b.pairid group by groupid) group_counts on a.groupid = group_counts.groupid 
    where a.pairid not in (select pairid from @previouspairs) 
    order by abs(@amountout - groupcount) asc 

    if @targetpair = -1 
    begin 
    insert @previousgroups select @groupid 
    end 
    else 
    begin 
    insert @previouspairs select @targetpair 
    update @output_group_pairs set groupid = @groupid where pairid = @targetpair 
    end 
end 

set @maxiter = 100 
delete @previouspairs 
delete @previousgroups 
while exists(select groupid from @output_group_pairs a join #pairsets b on a.pairid = b.pairid group by groupid having COUNT(id) > @targetgroupsize) 
begin 
    set @maxiter = @maxiter-1 
    if @maxiter = 0 break 
    set @groupid = -1 
    set @amountout = null 
    select @groupid = groupid, @amountout = COUNT(*)[email protected] 
    from @output_group_pairs a join #pairsets b on a.pairid = b.pairid 
    where groupid not in (select groupid from @previousgroups) 
    group by groupid having COUNT(*) > @targetgroupsize 
    if @groupid = -1 break 

    set @targetpair = -1 

    select @targetpair = a.pairid from @output_group_pairs a 
    join (select pairid from #pairsets group by pairid having COUNT(*) <= @amountout) b on a.pairid = b.pairid 
    join (select groupid, count(id) groupcount from @output_group_pairs a join #pairsets b on a.pairid = b.pairid group by groupid) group_counts on a.groupid = group_counts.groupid 
    where a.pairid not in (select pairid from @previouspairs) 
    order by abs(@amountout - groupcount) asc 

    if @targetpair = -1 
    begin 
    insert @previousgroups select @groupid 
    end 
    else 
    begin 
    insert @previouspairs select @targetpair 
    delete @output_group_pairs where pairid = @targetpair 
    end 
end 

-- output groups and their stats 
select GroupId, Id from @output_group_pairs a join #pairsets b on a.pairid = b.pairid order by 1, 2 

select a.GroupId, AVG(c.ratio) as [Average] , count(*) as [Count] 
from @output_group_pairs a 
join #pairsets b on a.pairid = b.pairid 
join t_stats c on b.id = c.id 
group by a.groupid 
go 

drop table #data 
drop table #pairsets 
drop table #pairseed 
drop proC#match 
+0

非常感謝...我印象深刻!如何更改您的腳本以指定每個組我想要的項目數量? (從't_stats'中的75個值中取得3組20個值) –

+0

另一件事:你已經設置了@ xmin = 0.75和@ xmax = 0.85,這給出了AVG(比率)約爲0.79和'COUNT(*)= 25'(完美)。如果我將這些值更改爲「@ xmin = 0.78」和「@ xmax = 0.80」,則會得到「AVG(ratio)= 0.90」和「COUNT(*)= 9672」(這是一個問題)。 –

+0

噢..你的例子是得到75箇中的30個組,所以我忽略了這個「n」是可變的要求,但它可以很容易地添加到包裝部分(這可能是最大的部分問題依然存在)。影響0.78至0.80情況的錯誤在第131至137行中,將它們註釋掉,並避免向多個組添加id。 –

0

哦,如果項目的一組中的數字是準確的要求,這是一個使用相同的詳盡匹配盒包裝的一部分,它要慢得多,雖然一個版本。

-- **Expects data in table t_stats (id, ratio)** 
if OBJECT_ID('tempdb..#data') is not null drop table #data 
if OBJECT_ID('tempdb..#data_pairs') is not null drop table #data_pairs 
if OBJECT_ID('tempdb..#pairsets') is not null drop table #pairsets 
if OBJECT_ID('tempdb..#pairseed') is not null drop table #pairseed 
if OBJECT_ID('tempdb..#match') is not null drop proC#match 
go 

-- rather horrible routine using dsql to find either: 
-- 1) groups of values that sum to exactly @targetsum (only if @targetsum non null) 
-- 2) the group containing the least values that includes data id @includeid and where the sum is within +- @targetsumrange 
create proC#match(@targetsum DECIMAL(8,4), @maxcombinations int, @includeid int, @targetsumrange DECIMAL(8,4)) as 
begin 
    set nocount on 
    declare @nearestmatch bit = 0 
    if @targetsum is null set @nearestmatch = 1 
    declare @combination table (value int, asstring varchar(10), alias varchar(50)) 
    declare @savedpairseed int = (select pairseed from #pairseed) 

    declare @stmtTemplate varchar(max) = 'declare @pairseed int = (select pairseed from #pairseed) 
    declare @DistSum DECIMAL(8,4) 
    <DeclareVars> 
    declare candloop cursor for select <SelectList>, <DistanceSum> as Dist_sum from <TableList> where <IdCheck> <SumCheck> 
    open candloop 
    fetch next from candloop into <VarsList>, @DistSum 
    while @@fetch_status = 0 
    begin 
    if (select count(*) from #data where id in (<VarsList>)) = <VarsCount> 
    begin 
     <DeleteData> 
     <InsertPairs> 
     set @pairseed = @pairseed + 1 
    end 
    fetch next from candloop into <VarsList>, @DistSum 
    end 
    close candloop 
    deallocate candloop 
    update #pairseed set pairseed = @pairseed ' 

    declare @combinations int = 1 
    while @combinations <= @maxcombinations 
    begin 
    insert @combination 
    select @combinations, cast(@combinations as varchar(10)), CHAR(ASCII('a')+ (@combinations-1)%26) + char(ascii('a') + @combinations-1) 
    declare @DeclareVars varchar(max) = '' 
    declare @SelectList varchar(max) = '' 
    declare @TableList varchar(max) = '' 
    declare @IdCheck varchar(max) = '' 
    declare @DistanceSum varchar(max) = '' 
    declare @InsertPairs varchar(max) = '' 
    declare @VarsList varchar(max) = '' 
    declare @SumCheck varchar(max) = '' 
    declare @DeleteData varchar(max) = 'delete #data where id in (<VarsList>)' 

    select @DeclareVars = @DeclareVars + 'declare @id'+asstring+ ' int ' from @combination 
    select @SelectList = @SelectList + alias +'.id, ' from @combination 
    set @SelectList = SUBSTRING(@selectlist, 1, LEN(@SelectList)-1) 
    select @TableList = @TableList + '#data '+alias+', ' from @combination 
    set @TableList = SUBSTRING(@TableList, 1, LEN(@TableList)-1) 
    select @IdCheck = @IdCheck + a.alias+'.id < '+b.alias+'.id and ' 
    from @combination a join @combination b on a.value+1 = b.value 
    if LEN(@IdCheck) > 4 
     set @IdCheck = SUBSTRING(@IdCheck, 1, LEN(@IdCheck)-4) + ' and ' 
    select @DistanceSum = @DistanceSum + alias+'.targetdistance + ' from @combination 
    set @DistanceSum = SUBSTRING(@DistanceSum, 1, LEN(@DistanceSum)-2) 
    select @VarsList = @VarsList + '@id'+asstring+ ', ' from @combination 
    set @VarsList = SUBSTRING(@VarsList, 1, LEN(@VarsList)-1) 
    select @InsertPairs = @InsertPairs + 'insert #pairsets select @pairseed, @id'+asstring+ ', @DistSum'+ CHAR(10) from @combination 
    set @SumCheck = @DistanceSum + ' = '+ cast(@Targetsum as varchar(20)) 

    if @nearestmatch = 1 
    begin 
     set @SumCheck = '(' 
     select @SumCheck = @SumCheck + alias+'.id = '+CAST(@includeid as varchar(10))+' or ' from @combination 
     if LEN(@SumCheck) > 4 
     set @SumCheck = SUBSTRING(@SumCheck, 1, LEN(@SumCheck)-3) 
     set @SumCheck = @SumCheck + ')' 
     set @DeleteData = '' 
    end 

    declare @stmt varchar(max) 
    set @stmt = REPLACE(@stmtTemplate, '<DeclareVars>', @DeclareVars) 
    set @stmt = REPLACE(@stmt, '<DeleteData>', @DeleteData) 
    set @stmt = REPLACE(@stmt, '<SelectList>', @SelectList) 
    set @stmt = REPLACE(@stmt, '<TableList>', @TableList) 
    set @stmt = REPLACE(@stmt, '<IdCheck>', @IdCheck) 
    set @stmt = REPLACE(@stmt, '<DistanceSum>', @DistanceSum) 
    set @stmt = REPLACE(@stmt, '<InsertPairs>', @InsertPairs) 
    set @stmt = REPLACE(@stmt, '<VarsList>', @VarsList) 
    set @stmt = REPLACE(@stmt, '<VarsCount>', cast(@combinations as varchar(10))) 
    set @stmt = REPLACE(@stmt, '<SumCheck>', @SumCheck) 

    exec (@stmt)  
    set @combinations = @combinations + 1 
    end 

    if @nearestmatch = 1 
    begin 
    -- above will have recorded all possible matches within range 
    -- remove all but the closest and reindex the pair ids 
    declare @bestmatch int 
    select top 1 @bestmatch = pairid from #pairsets where pairid >= @savedpairseed and ABS(distsum) < @targetsumrange 
    delete #pairsets where pairid >= @savedpairseed and pairid <> ISNULL(@bestmatch, -1) 
    delete #data where id in (select id from #pairsets where pairid = @bestmatch) 
    update #pairsets set pairid = @savedpairseed where pairid = @bestmatch 
    update #pairseed set pairseed = @savedpairseed+1 
    end 

end 

go 
set nocount on 
-- set the parameters 
declare @xmin DECIMAL(8,4) = 0.75 
declare @xmax DECIMAL(8,4) = 0.85 
declare @xrange DECIMAL(8,4) = @xmax - @xmin 
declare @xtarg DECIMAL(8,4) = (@[email protected])/2 
declare @ngroups int = 3 
declare @targetgroupsize int = 5 

declare @maxbalancedpair int 

-- copy the ratio data (using 75 row data from updated question) 
select *, [email protected] as targetdistance, abs(ratio - @xtarg) as targetdistanceabsolute into #data from t_stats 
create table #pairseed (pairseed int) 
create table #pairsets (pairid int, id int, distsum DECIMAL(8,4)) 
insert #pairseed select 1 

-- due to the 2 decimal points and distribution of the data we can find many n-tuples that sum to zero 
exeC#match 0, 8, 0, 0 

select @maxbalancedpair = pairseed-1 from #pairseed 

declare @deviants table (id int) 
declare @most_deviant int 
while exists(select * from #data where id not in (select id from @deviants)) 
begin 
    select top 1 @most_deviant = id from #data where id not in (select id from @deviants) order by targetdistanceabsolute desc 
    insert @deviants select @most_deviant 
    exeC#match null, 8, @most_deviant, @xrange 
end 

select * into #data_pairs from #pairsets 
delete #data 
delete #pairsets 
update #pairseed set pairseed = 1 
insert #data 
select pairid, COUNT(*), COUNT(*), COUNT(*) from #data_pairs group by pairid 

if (select SUM(ratio) from #data) < @targetgroupsize * @ngroups 
begin 
    raiserror('Cannot match - not enough data', 16, 1) 
    return 
end 

-- find the minimum number of matches that will reach targetgroupsize 
declare @maxmatches int = -1 
declare @matchcount int 
declare @matchsum int = 0 
declare maxmatchcount cursor for select CAST(ratio as int) from #data order by ratio asc 
open maxmatchcount 
fetch next from maxmatchcount into @matchcount 
while @@FETCH_STATUS = 0 and @matchsum <= @targetgroupsize 
begin 
    set @maxmatches = @maxmatches + 1 
    set @matchsum = @matchsum + @matchcount 
    fetch next from maxmatchcount into @matchcount 
end 
close maxmatchcount 
deallocate maxmatchcount 

exeC#match @targetgroupsize, @maxmatches, null, null 

declare @output_group_pairs table (groupid int, pairid int) 

insert @output_group_pairs select pairid, id from #pairsets where pairid <= @ngroups 

-- output groups and their stats 
select GroupId, Id from @output_group_pairs a join #data_pairs b on a.pairid = b.pairid order by 1, 2 

select a.GroupId, AVG(c.ratio) as [Average] , count(*) as [Count] 
from @output_group_pairs a 
join #data_pairs b on a.pairid = b.pairid 
join t_stats c on b.id = c.id 
group by a.groupid 

go 

drop table #data 
drop table #data_pairs 
drop table #pairsets 
drop table #pairseed 
drop proC#match 
+0

我想知道你的代碼是如何工作的......我用'0.75

+0

在最糟糕的情況下,由於暴力破壞組合,它會佔用大量內存。也許你應該運行所有建議的algortithms,並選擇X秒後的最佳答案... :) –