2017-05-05 32 views


Obs group replicate  height  weight   bp cholesterol 

    1  1   A   0.406  0.887  0.262  0.683 
    2  1   B   0.656  0.700  0.083  0.836 
    3  1   C   0.645  0.711  0.349  0.383 
    4  1   D   0.115  0.266 666.000  0.015 
    5  2   A   0.607  0.247  0.644  0.915 
    6  2   B   0.172 333.000 555.000  0.924 
    7  2   C   0.680  0.417  0.269  0.499 
    8  2   D   0.787  0.260  0.610  0.142 
    9  3   A   0.406  0.099  0.263  111.000 
    10  3   B   0.981 444.000  0.971  0.894 
    11  3   C   0.436  0.502  0.563  0.580 
    12  3   D   0.814  0.959  0.829  0.245 
    13  4   A   0.488  0.273  0.463  0.784 
    14  4   B   0.141  0.117  0.674  0.103 
    15  4   C   0.152  0.935  0.250  0.800 
    16  4   D  222.000  0.247  0.778  0.941 

 Obs group replicate height weight  bp  cholesterol 

     1  1   A  0.4056 0.8870 0.2615  0.6827 
     2  1   B  0.6556 0.6995 0.0829  0.8356 
     3  1   C  0.6445 0.7110 0.3492  0.3826 
     4  1   D  0.1146 0.2655  .   0.0152 
     5  2   A  0.6072 0.2474 0.6444  0.9154 
     6  2   B  0.1720  .   .   0.9241 
     7  2   C  0.6800 0.4166 0.2686  0.4992 
     8  2   D  0.7874 0.2595 0.6099  0.1418 
     9  3   A  0.4057 0.0988 0.2632  . 
     10  3   B  0.9805  .  0.9712  0.8937 
     11  3   C  0.4358 0.5023 0.5626  0.5799 
     12  3   D  0.8138 0.9588 0.8293  0.2448 
     13  4   A  0.4881 0.2731 0.4633  0.7839 
     14  4   B  0.1413 0.1166 0.6743  0.1032 
     15  4   C  0.1522 0.9351 0.2504  0.8003 
     16  4   D   .  0.2465 0.7782  0.9412 


/* Sample measurements: one row per group/replicate pair.
 * The values 111, 222, 333, 444, 555 and 666 are the planted outliers. */
data have;
    input group replicate $ height weight bp cholesterol;
    datalines;
1 A 0.4056 0.8870 0.2615 0.6827
1 B 0.6556 0.6995 0.0829 0.8356
1 C 0.6445 0.7110 0.3492 0.3826
1 D 0.1146 0.2655 666 0.0152
2 A 0.6072 0.2474 0.6444 0.9154
2 B 0.1720 333 555 0.9241
2 C 0.6800 0.4166 0.2686 0.4992
2 D 0.7874 0.2595 0.6099 0.1418
3 A 0.4057 0.0988 0.2632 111
3 B 0.9805 444 0.9712 0.8937
3 C 0.4358 0.5023 0.5626 0.5799
3 D 0.8138 0.9588 0.8293 0.2448
4 A 0.4881 0.2731 0.4633 0.7839
4 B 0.1413 0.1166 0.6743 0.1032
4 C 0.1522 0.9351 0.2504 0.8003
4 D 222 0.2465 0.7782 0.9412
;
run;

/* Lookup table of known outliers: which variable (parameter) holds which
 * bad measurement, for which group/replicate. */
data outliers;
    /* The colon modifier reads PARAMETER with list input up to the next blank
     * (max 11 chars). Plain formatted input ($11.) would always consume 11
     * columns and swallow the following fields for short names like "bp". */
    input parameter :$11. group replicate $ measurement;
    datalines;
cholesterol 3 A 111
height  4 D 222
weight  2 B 333
weight  3 B 444
bp   2 B 555
bp   1 D 666
;
run;


/* Baseline (hard-coded) solution: one IF statement per known outlier.
 * Each statement blanks the offending measurement only when the
 * group, replicate and value all match. */
data want;
    set have;

    if group = 3 and replicate = 'A' and cholesterol = 111 then cholesterol = .;
    if group = 4 and replicate = 'D' and height  = 222 then height  = .;
    if group = 2 and replicate = 'B' and weight  = 333 then weight  = .;
    if group = 3 and replicate = 'B' and weight  = 444 then weight  = .;
    if group = 2 and replicate = 'B' and bp   = 555 then bp   = .;
    if group = 1 and replicate = 'D' and bp   = 666 then bp   = .;
run;




可以看看 SAS 的 UPDATE 語句。 – DCR


您是否有每行的唯一標識符? – Reeza


在您的示例中,超過100的任何值都是異常值。這是你的規則嗎?將這些級別加載到臨時數組中然後使用數組來循環並將其分配爲缺失可能更容易。 – Reeza





  • 你可以使用一個數組,例如 array params{*} height -- cholesterol;,然後在遍歷數組時用 vname 函數取得變量名,與 parameter 變量的值進行比較。但在你的情況下這會變得複雜,因爲這是一對多的合併:你必須保留(retain)已做的替換,並且只輸出每個 BY 組的最後一條記錄。
  • 您可以使用 proc transpose 轉置異常值數據,但這會變得冗長,因爲每個 parameter 都需要單獨轉置一次,之後還要把所有轉置後的數據集合並回 have 數據集。我認爲這種方法的主要問題是:大量的轉置會讓代碼變得笨重。
  • 您提到的宏變量做法在邏輯上可能顯得過於繁瑣。但相比其他讓 parameter 變量的值與 have 數據集中的變量名相匹配的方法,我並不認爲下面這樣的做法算是過度:

    /* Code generation: turn every OUTLIERS row into one macro variable
     * (outlierstatement1..N) holding a complete IF statement, then replay
     * those statements inside a data step that reads HAVE. */

    /* Default so the %DO loop below runs zero times when OUTLIERS is empty. */
    %let outliercount = 0;

    data _null_;
        set outliers;
        /* CATS builds a valid macro variable name; "name"||_n_ would embed
         * the leading blanks from the numeric-to-character conversion. */
        call symputx(cats("outlierstatement", _n_),
                     "if group = "||group||" and replicate = '"||replicate||"' and "||parameter||" = "||measurement||" then "|| parameter ||" = .;");
        call symputx("outliercount", _n_);
    run;

    /* NOTE(review): the %DO body, %END, RUN, %MEND and the macro call were
     * missing from the snippet; reconstructed from the variable names above. */
    %macro makewant();
        %local i;
        data want;
            set have;
            %do i = 1 %to &outliercount;
                &&outlierstatement&i
            %end;
        run;
    %mend makewant;

    %makewant()





  1. 將過濾(異常值)數據轉置成與 want 數據相同的形式,稱之爲 filter^。
  2. 按記錄鍵(即 group 和 replicate)合併 want 與 filter^。
  3. 用數組逐一處理各數據元素,檢查是否符合過濾條件。


/* Sample measurements: one row per group/replicate pair.
 * The final row (group 5, replicate E) is a test record for the
 * filter value misalignment test. */
data have;
    input group replicate $ height weight bp cholesterol;
    datalines;
1 A 0.4056 0.8870 0.2615 0.6827
1 B 0.6556 0.6995 0.0829 0.8356
1 C 0.6445 0.7110 0.3492 0.3826
1 D 0.1146 0.2655 666 0.0152
2 A 0.6072 0.2474 0.6444 0.9154
2 B 0.1720 333 555 0.9241
2 C 0.6800 0.4166 0.2686 0.4992
2 D 0.7874 0.2595 0.6099 0.1418
3 A 0.4057 0.0988 0.2632 111
3 B 0.9805 444 0.9712 0.8937
3 C 0.4358 0.5023 0.5626 0.5799
3 D 0.8138 0.9588 0.8293 0.2448
4 A 0.4881 0.2731 0.4633 0.7839
4 B 0.1413 0.1166 0.6743 0.1032
4 C 0.1522 0.9351 0.2504 0.8003
4 D 222 0.2465 0.7782 0.9412
5 E 222 0.2465 0.7782 0.9412
;
run;

/* Lookup table of known outliers (parameter name, key, bad value).
 * The "height 5 E 223" row is a test record for the filter value
 * misalignment test. */
data outliers;
    /* Widened so the parameter can be transposed into a column name via ID. */
    length parameter $32;
    /* measurement is read as a numeric variable. */
    input parameter $ group replicate $ measurement;
    datalines;
cholesterol 3 A 111
height  4 D 222
height  5 E 223
weight  2 B 333
weight  3 B 444
bp   2 B 555
bp   1 D 666
;
run;

/* Reference result built with hard-coded IF statements, one per known
 * outlier. The explicit RUN closes the step so that following text
 * cannot be absorbed into it. */
data want;
    set have;

    if group = 3 and replicate = 'A' and cholesterol = 111 then cholesterol = .;
    if group = 4 and replicate = 'D' and height  = 222 then height  = .;
    if group = 2 and replicate = 'B' and weight  = 333 then weight  = .;
    if group = 3 and replicate = 'B' and weight  = 444 then weight  = .;
    if group = 2 and replicate = 'B' and bp   = 555 then bp   = .;
    if group = 1 and replicate = 'D' and bp   = 666 then bp   = .;
run;

/* Create a view whose first rows list every filtered parameter.
 * This is necessary so that the first transposed filter row
 * will have the parameters as columns in alphabetic order. */

proc sql noprint;
    /* The parameter-only rows have missing group/replicate, so the ORDER BY
     * puts them first, alphabetically by parameter.
     * NOTE(review): the set operator between the two SELECTs was missing
     * from the snippet; OUTER UNION CORR keeps all columns, aligned by name. */
    create view outliers_transpose_ready as
    select distinct parameter from outliers
    outer union corr
    select * from outliers
    order by group, replicate, parameter
    ;

    /* Generate an alphabetically ordered list of parameters for use
     * as a variable (aka column) list in the filter application step. */
    select distinct parameter
    into :parameters separated by ' '
    from outliers
    order by parameter
    ;
quit;

%put NOTE: &=parameters;

/* Transpose the filter data.
 * The ID statement pivots row data into column names.
 * The prefix=_filter_ ensures the new column names
 * will not collide with the original data, and can be
 * listed with the shortcut _filter_: in an array statement. */

proc transpose data=outliers_transpose_ready out=outliers_apply_ready prefix=_filter_;
    by group replicate notsorted;
    id parameter;
    var measurement;
run;

/* Robust production code should contain a bin for
 * data that does not conform to the filter application conditions:
 * WANT2 receives the filtered rows, WANT2_WARNINGS receives rows where
 * a filter value was supplied but did not match the data. */

data
    want2(label="Outlier filtering applied" drop=_i_ _filter_:)
    want2_warnings(label="Outlier filtering: misaligned values")
    ;
    merge have outliers_apply_ready(keep=group replicate _filter_:);
    by group replicate;

    /* The arrays are for like-named columns; they pair up by position
     * due to the alphabetic ordering enforced in data and codegen preparation. */
    array value_filter_check _filter_:;
    array value &parameters;

    /* Discard rows whose group is missing. */
    if group ne .;

    do _i_ = 1 to dim(value);

        if value(_i_) EQ value_filter_check(_i_) then
            value(_i_) = .;
        /* ELSE is required: once a value has been blanked it no longer
         * equals its filter value and would be reported as a mismatch. */
        else
        if not missing(value_filter_check(_i_)) AND
           value(_i_) NE value_filter_check(_i_)
        then do;
            put 'WARNING: Filtering expected but values do not match. ' group= replicate= value(_i_)= value_filter_check(_i_)=;
            output want2_warnings;
        end;

    end;

    output want2;
run;


/* Compare the hard-coded result (WANT) with the data-driven result (WANT2);
 * OUTNOEQUAL writes only observations that differ to DIFFS. */
proc compare noprint data=want compare=want2 outnoequal out=diffs;
    by group replicate;
run;




/* Hash-lookup solution: load OUTLIERS into a hash keyed by
 * (parameter, group, replicate); for every row of HAVE, blank each
 * measurement whose name/group/replicate combination is in the hash. */
data want;
    /* Never executes; only adds the key variables to the PDV at compile time. */
    if 0 then set outliers (keep=parameter group replicate);

    /* Build the hash once, on the first iteration. */
    if _N_ = 1 then do;
        declare hash h(dataset:'outliers') ;
        h.defineKey('parameter', 'group', 'replicate') ;
        h.defineDone() ;
    end;

    set have ;

    array vars {*} height weight bp cholesterol ;

    do i=1 to dim(vars);
        /* The lookup key must name the variable currently being examined. */
        parameter = vname(vars{i});
        if h.check()=0 then call missing(vars{i});
    end;

    drop i parameter;
run;

我不認爲 'if 0 then' 語句是必要的,因爲你已在 hash 聲明語句中引用了 dataset:'outliers',不過我可能理解有誤。hash 的用法很難確定,因爲文檔本身就包含錯誤。 –


在執行 defineDone() 之前,哈希表用到的變量必須已經存在於 PDV(程序數據向量)中。如果你註釋掉 'if 0 then' 就會出錯。即使指定了數據集,DECLARE 語句也不會把變量加入 PDV。大多數 SAS 文檔使用 LENGTH 語句向 PDV 添加變量,但我認爲 'if 0 then set' 更容易。 – Quentin


我無法重現您提到的錯誤。無論有沒有 'if 0 then',我觀察到的行爲都相同。我的理解是 'declare hash' 使用 'dataset:' 參數來實例化。也就是說,'dataset:' 參數中提供的數據集用於定義 'defineKey' 語句中列出的鍵。 https://support.sas.com/documentation/cdl/en/lecompobjref/69740/HTML/default/viewer.htm#p00ilfw5pzcjvtn1nfya9863fozd.htm#p04u7k0pr34yxjn13idjdtdpp8o4 –



You could use an array like array params{*} height -- cholesterol; and then use the vname function as you loop through the array to compare to the value in the parameter variable, but this gets complicated in your case because you have a one to many merge, so you would have to retain the replacements and only output the last record for each by group... so it gets complicated.



/* Merge solution: a one-to-many merge of HAVE with OUTLIERS by key.
 * OUTLIERS must be sorted by the BY variables first. */
proc sort data=outliers;
    by group replicate;
run;

data want (keep=group replicate height weight bp cholesterol);
    merge have (in=a)
          outliers (keep=group replicate parameter in=b)
          ;
    by group replicate;

    array vars {*} height weight bp cholesterol ;

    /* Blank the measurement whose variable name matches this outlier row. */
    do i=1 to dim(vars);
        if vname(vars{i})=parameter then call missing(vars{i});
    end;

    /* Several outlier rows can share one key; output only once per BY
     * group, after all of its outlier rows have been applied. */
    if last.replicate;
run;

我不清楚你爲什麼決定使用 'in='?你能說明一下嗎? –


主要只是習慣。我經常會加上類似 if a = 0 then put 'ERROR:' (group replicate)(=); 的語句,以防異常值數據集中出現意外情況。 – Quentin


說得好。就我而言,outliers 數據集來自同事。我沒有考慮過異常值是自動生成、並且可能返回空數據集的情況。 –



如果你希望得到一個包含這些 if 語句的數據集(例如用於驗證),只需去掉 into : 子句,並在 PROC SQL 步驟的開頭加上一行 create table statements as。

/* Sample measurements: one row per group/replicate pair.
 * The values 111, 222, 333, 444, 555 and 666 are the planted outliers. */
data have;
    input group replicate $ height weight bp cholesterol;
    datalines;
1 A 0.4056 0.8870 0.2615 0.6827
1 B 0.6556 0.6995 0.0829 0.8356
1 C 0.6445 0.7110 0.3492 0.3826
1 D 0.1146 0.2655 666 0.0152
2 A 0.6072 0.2474 0.6444 0.9154
2 B 0.1720 333 555 0.9241
2 C 0.6800 0.4166 0.2686 0.4992
2 D 0.7874 0.2595 0.6099 0.1418
3 A 0.4057 0.0988 0.2632 111
3 B 0.9805 444 0.9712 0.8937
3 C 0.4358 0.5023 0.5626 0.5799
3 D 0.8138 0.9588 0.8293 0.2448
4 A 0.4881 0.2731 0.4633 0.7839
4 B 0.1413 0.1166 0.6743 0.1032
4 C 0.1522 0.9351 0.2504 0.8003
4 D 222 0.2465 0.7782 0.9412
;
run;

/* Lookup table of known outliers: which variable (parameter) holds which
 * bad measurement, for which group/replicate. */
data outliers;
    /* The colon modifier reads PARAMETER with list input up to the next blank
     * (max 11 chars). Plain formatted input ($11.) would always consume 11
     * columns and swallow the following fields for short names like "bp". */
    input parameter :$11. group replicate $ measurement;
    datalines;
cholesterol 3 A 111
height  4 D 222
weight  2 B 333
weight  3 B 444
bp   2 B 555
bp   1 D 666
;
run;

/* Build one IF statement per OUTLIERS row and collect them, separated by
 * blanks, into the macro variable LISTIFS. */
proc sql noprint;
    select cat('if group = '
     , strip(put(group, best32.))
     , " and replicate = '"
     , strip(replicate)
     , "' and "
     , strip(parameter)
     , ' = '
     , strip(put(measurement, best32.))
     , ' then '
     , strip(parameter)
     , ' = . ;')
    into :listIfs separated by ' '
    from outliers
    ;
quit;

/* %QUOTE masks the semicolons in the generated code so that %PUT prints
 * the whole list instead of stopping at the first one. */
%put %quote(&listIfs);

data want; 
    set have; 