2014-09-24 27 views
2

假設我們有以下數據集(問題:如何使用兩個變量進行分組):

DATE VAR1 VAR2 
1  A  1 
2  A  1 
3  B  1 
4  C  2 
5  D  3 
6  E  4 
7  F  5 
8  B  6 
9  B  7 
10 D  1 

每條記錄屬於一個人,問題是,同一個人可以有多條取值不同的記錄。

識別一個人的規則:如果兩條記錄共享相同的VAR1,它們屬於同一個人;同樣,如果兩條記錄共享相同的VAR2,它們也屬於同一個人。

我的目標是創建一個新的變量IDPERSON,它唯一地標識每個記錄的人。在我的示例中,只有4個不同的人:

DATE VAR1 VAR2 IDPERSON 
1  A  1  1 
2  A  1  1 
3  B  1  1 
4  C  2  2 
5  D  3  1 
6  E  4  3 
7  F  5  4 
8  B  6  1 
9  B  7  1 
10  D  1  1 

如何通過使用SQL或SAS實現此目的?

+0

你能對齊你的數據嗎?現在它很難讀。 – 2014-09-24 09:01:20

+0

你不能用直接查詢來做到這一點,因爲任何兩行之間可以有任意數量的「跳躍」。 – Eevee 2014-09-24 09:03:45

+0

這裏存在某種循環效應……這樣做很難……即使您只比較一列,用PSQL也很難做到 – masum7 2014-09-24 09:31:04

回答

0

我忘了發佈我最終的解決方案,它是一個SAS宏。我還做了另一個適用於3個變量的版本。

%MACRO GROUPER2(INDATA,OUTDATA,ID1,ID2,IDOUT,IDN=_N_,MAXN=5); 
    /*
     * GROUPER2: label connected groups of records. Two records belong to the
     * same group when they share a non-missing value of ID1 or a non-missing
     * value of ID2, directly or through a chain of other records.
     *
     * Parameters:
     *   INDATA   input dataset
     *   OUTDATA  output dataset; it holds only ID1, ID2 and IDOUT
     *   ID1      first linking variable
     *   ID2      second linking variable
     *   IDOUT    name of the group ID variable to create
     *   IDN      expression giving the starting ID of each record
     *            (default _N_, the row number); rows where it is missing
     *            are numbered after the largest supplied ID
     *   MAXN     maximum number of refinement passes (default 5)
     *
     * Also creates the lookup tables OUTDATA._1 (ID1 to group) and
     * OUTDATA._2 (ID2 to group), and finally deletes every dataset whose
     * name starts with _G_ from the default library.
     *
     * Each pass replaces a record's group ID with the smallest ID seen for
     * its ID1 value or its ID2 value; looping stops when a pass changes
     * nothing or after MAXN passes.
     */

    /* Keep the working macro variables out of the caller's symbol tables */
    %LOCAL I ITERATIONS MAXIDOUT STOPFLAG; 
    /* Defined up front so the final report also works when MAXN is below 1 */
    %LET ITERATIONS = 0; 
    %LET STOPFLAG = 0; 

    %PUT ****************************************************************; 
    %PUT ****************************************************************; 
    %PUT **** GROUPER MACRO; 
    %PUT **** PARAMETERS:; 
    %PUT ****  INPUT DATA: &INDATA.; 
    %PUT ****  OUTPUT DATA: &OUTDATA.; 
    %PUT ****  FIRST VARIABLE: &ID1.; 
    %PUT ****  SECOND VARIABLE: &ID2.; 
    %PUT ****  OUTPUT GROUPING VARIABLE: &IDOUT.; 
    %IF (&IDN.=_N_) %THEN %PUT ****  STARTING NUMBER VARIABLE: AUTONUMBER; 
    %ELSE %PUT ****  STARTING NUMBER VARIABLE: &IDN.; 
    %PUT ****  MAX ITERATIONS: &MAXN.; 
    %PUT ****************************************************************; 
    %PUT ****************************************************************; 

    /* CREATE FIRST GUESS FOR GROUP ID */ 
    /* Rows with a usable starting ID go to _G_TEMP1, the rest to _G_TEMP2 */
    DATA _G_TEMP1 _G_TEMP2; 
     SET &INDATA.; 
     &IDOUT.=&IDN.; 
     IF &IDOUT.=. THEN OUTPUT _G_TEMP2; 
     ELSE OUTPUT _G_TEMP1; 
    RUN; 
    PROC SQL NOPRINT; 
     /* COALESCE: with no usable starting ID the maximum is missing and would
        turn every generated ID into missing. BEST32.: the default BEST8.
        conversion would round IDs longer than 8 digits. */
     SELECT COALESCE(MAX(&IDOUT.), 0) FORMAT=BEST32. INTO :MAXIDOUT FROM _G_TEMP1; 
    QUIT; 
    /* Number the remaining rows after the largest supplied ID */
    DATA _G_TEMP2; 
     SET _G_TEMP2; 
     &IDOUT.=_N_+&MAXIDOUT.; 
    RUN; 
    DATA _G_TEMP; 
     SET _G_TEMP1 _G_TEMP2; 
    RUN; 

    /* Records with neither linking variable cannot be grouped: blank their ID */
    PROC SQL; 
     UPDATE _G_TEMP SET &IDOUT.=. WHERE &ID1. IS NULL AND &ID2. IS NULL; 
    QUIT; 

    /* LOOP, IMPROVE GROUP ID EACH TIME*/ 
    %LET I = 1; 
    %DO %WHILE (&I. <= &MAXN.); 
     %PUT LOOP NUMBER &I.; 
     %LET I = %EVAL(&I. + 1); 

     PROC SQL NOPRINT; 
      /* FIND THE LOWEST GROUP ID FOR EACH GROUP OF FIRST VARIABLE */ 
      CREATE TABLE _G_MAP1 AS SELECT MIN(&IDOUT.) AS &IDOUT., &ID1. FROM _G_TEMP WHERE &ID1. IS NOT NULL GROUP BY &ID1.; 
      /* FIND THE LOWEST GROUP ID FOR EACH GROUP OF SECOND VARIABLE */ 
      CREATE TABLE _G_MAP2 AS SELECT MIN(&IDOUT.) AS &IDOUT., &ID2. FROM _G_TEMP WHERE &ID2. IS NOT NULL GROUP BY &ID2.; 
      /* FIND THE LOWEST GROUP ID FROM BOTH GROUPING VARIABLES */ 
      /* The two-argument MIN is the row-wise function, not the aggregate */
      CREATE TABLE _G_NEW AS SELECT A.&ID1., A.&ID2., COALESCE(MIN(B.&IDOUT., C.&IDOUT.), A.&IDOUT.) AS &IDOUT., 
       A.&IDOUT. AS &IDOUT._OLD FROM _G_TEMP AS A FULL OUTER JOIN _G_MAP1 AS B ON A.&ID1. = B.&ID1. 
       FULL OUTER JOIN _G_MAP2 AS C ON A.&ID2. = C.&ID2.; 
      /* PUT RESULTS INTO TEMPORARY DATASET READY FOR NEXT ITERATION */ 
      CREATE TABLE _G_TEMP AS SELECT * FROM _G_NEW ORDER BY &ID1., &ID2.; 
      /* CHECK IF THE ITERATION PROVIDED ANY IMPROVEMENT */ 
      /* COALESCE: on an empty table the aggregate is missing, which the
         %IF below could not evaluate; nothing left to improve means 1 */
      SELECT COALESCE(MIN(CASE WHEN &IDOUT._OLD = &IDOUT. THEN 1 ELSE 0 END), 1) INTO :STOPFLAG FROM _G_TEMP; 
      %PUT NO IMPROVEMENT? &STOPFLAG.; 
     QUIT; 
     /* END LOOP IF ID UNCHANGED OVER LAST ITERATION */ 
     %LET ITERATIONS=%EVAL(&I. - 1); 
     %IF &STOPFLAG. %THEN %LET I = %EVAL(&MAXN. + 1); 
    %END; 

    %PUT ****************************************************************; 
    %PUT ****************************************************************; 
    %IF &STOPFLAG. %THEN %PUT **** LOOPING ENDED BY NO-IMPROVEMENT CRITERIA. OUTPUT FULLY GROUPED.; 
    %ELSE %PUT **** WARNING: LOOPING ENDED BY REACHING THE MAXIMUM NUMBER OF ITERARIONS. OUTPUT NOT FULLY GROUPED.; 
    %PUT **** NUMBER OF ITERATIONS: &ITERATIONS. (MAX: &MAXN.); 
    %PUT ****************************************************************; 
    %PUT ****************************************************************; 

    /* Final record-level output without the helper column */
    DATA &OUTDATA.; 
     SET _G_TEMP; 
     DROP &IDOUT._OLD; 
    RUN; 
    /* OUTPUT LOOKUP TABLE */ 
    PROC SQL; 
     CREATE TABLE &OUTDATA._1 AS SELECT &ID1., MIN(&IDOUT.) AS &IDOUT. FROM _G_TEMP WHERE &ID1. IS NOT NULL GROUP BY &ID1. ORDER BY &ID1.; 
     CREATE TABLE &OUTDATA._2 AS SELECT &ID2., MIN(&IDOUT.) AS &IDOUT. FROM _G_TEMP WHERE &ID2. IS NOT NULL GROUP BY &ID2. ORDER BY &ID2.; 
    QUIT; 

    /* CLEAN UP */ 
    /* Removes every dataset named _G_* from the default library */
    PROC DATASETS NOLIST; 
     DELETE _G_:; 
    QUIT; 
%MEND GROUPER2; 
0

您認爲這樣會起作用嗎?

它用SAS編寫,但它使用SQL語句。

/* Sample records: numeric VAR1, character VAR2, record sequence DATE */
DATA TEMP3;
    INPUT VAR1 VAR2 $ DATE;
DATALINES;
1 A 1
1 A 2
1 B 3
2 C 4
3 D 5
4 E 6
5 F 7
6 B 8
7 B 9
1 D 10
;
RUN;

PROC SQL;
    /* Distinct (VAR2, VAR1) pairs, ordered by VAR2 */
    CREATE TABLE WORK.TEMP4 AS
        SELECT DISTINCT VAR2, VAR1
        FROM WORK.TEMP3
        ORDER BY VAR2, VAR1;

    /* The same distinct pairs, ordered by VAR1 */
    CREATE TABLE WORK.TEMP5 AS
        SELECT DISTINCT VAR1, VAR2
        FROM WORK.TEMP3
        ORDER BY VAR1, VAR2;

    /* For each pair, every VAR2 that shares its VAR1 (VAR22) */
    CREATE TABLE WORK.TEMP6 AS
        SELECT P.VAR2, P.VAR1, Q.VAR2 AS VAR22
        FROM WORK.TEMP4 AS P
        INNER JOIN WORK.TEMP5 AS Q
            ON (P.VAR1 = Q.VAR1);

    /* For each of those rows, every VAR1 that shares its VAR2 (VAR12) */
    CREATE TABLE WORK.TEMP7 AS
        SELECT S.*, Q.VAR1 AS VAR12
        FROM WORK.TEMP6 AS S
        INNER JOIN WORK.TEMP5 AS Q
            ON (S.VAR2 = Q.VAR2);

    /* Distinct (VAR22, VAR12) links found by the two joins */
    CREATE TABLE WORK.TEMP8 AS
        SELECT DISTINCT VAR22, VAR12
        FROM WORK.TEMP7
        ORDER BY VAR22, VAR12;

    /* Largest linked VAR12 for each VAR22 */
    CREATE TABLE WORK.TEMP9 AS
        SELECT VAR22, MAX(VAR12) AS VAR12
        FROM WORK.TEMP8
        GROUP BY VAR22;

    /* Keep only the link rows that carry that largest value */
    CREATE TABLE WORK.TEMP10 AS
        SELECT L.*
        FROM WORK.TEMP8 AS L
        INNER JOIN WORK.TEMP9 AS M
            ON (L.VAR22 = M.VAR22 AND L.VAR12 = M.VAR12);

    /* Attach the chosen value to every original record as IDPERSONA */
    CREATE TABLE WORK.TEMP11 AS
        SELECT R.*, K.VAR12 AS IDPERSONA
        FROM WORK.TEMP3 AS R
        LEFT JOIN WORK.TEMP10 AS K
            ON (R.VAR2 = K.VAR22);
QUIT;
+0

我仍然有這個代碼的麻煩,在一些特殊情況下,它不會工作。 – 2014-09-24 11:15:59

0

我已經將這個問題分解成了幾個步驟,它適用於您提供的數據。可能有一種方法可以減少步驟的數量,但代價是可讀性。讓我知道這是否適用於您的真實數據。

/* Build the example input, one row per record */
data have;
    input DATE VAR1 $ VAR2;
    datalines;
1  A  1
2  A  1
3  B  1
4  C  2
5  D  3
6  E  4
7  F  5
8  B  6
9  B  7
10 D  1
;
run;

/* Smallest VAR2 seen for each VAR1 value, stored as temp_var */
proc summary data=have nway idmin;
    class var1;
    output out=minvar2 (drop=_:) min(var2)=temp_var;
run;

/* Attach that minimum to every record and order the rows by it */
proc sql;
    create table temp1 as
        select
            src.*,
            lo.temp_var
        from have as src
        inner join minvar2 as lo
            on src.var1 = lo.var1
        order by lo.temp_var;
quit;

/* Number the distinct temp_var values 1, 2, 3, ... as idperson */
data want;
    set temp1;
    by temp_var;
    /* first.temp_var is 1 on the first row of each BY group, else 0 */
    idperson + first.temp_var;
    drop temp_var;
run;

/* Restore the original record order */
proc sort data=want;
    by date var1;
run;
+0

謝謝你,我會探索你的解決方案。我自己找到了另一種解決方案,但它非常複雜。另外,我的數據庫也比較複雜,因爲它缺少了Var1和Var2的數據,我不得不獨立處理其他數據。 – 2014-09-25 06:30:23

0

基思:

您的解決方案不能正常工作,看看下面的數據集:

/* Counter-example data: numeric VAR2, character VAR1, record sequence DATE.
   DUMMY is a constant flag set to 1 on every row. */
DATA TEMP3;
    INPUT VAR2 VAR1 $ DATE;
    DUMMY = 1;
DATALINES;
1 A 1
1 A 2
1 B 3
2 C 4
3 D 5
4 E 6
5 F 7
6 B 8
7 B 9
1 D 10
1 X 11
7 G 14
6 Y 15
6 D 16
6 I 18
8 D 20
9 Z 21
9 X 22
;
RUN;

你的程序的結果是:

VAR2 VAR1 DATE DUMMY idperson 
1 A 1 1 1 
1 A 2 1 1 
1 B 3 1 1 
2 C 4 1 2 
3 D 5 1 1 
4 E 6 1 3 
5 F 7 1 4 
6 B 8 1 1 
7 B 9 1 1 
1 D 10 1 1 
1 X 11 1 1 
7 G 14 1 6 
6 Y 15 1 5 
6 D 16 1 1 
6 I 18 1 5 
8 D 20 1 1 
9 Z 21 1 7 
9 X 22 1 1 

這是不正確的,因爲VAR2 = 6的記錄有兩個不同的ID。

這就是我所做的,整個程序(這裏沒有發佈)比較複雜(而且沒那麼優雅),因爲它處理Var1和Var2中缺失的數據。

PROC SQL;
    /* Distinct (VAR1, VAR2) pairs among flagged rows with a non-missing VAR2 */
    CREATE TABLE WORK.TEMP4 AS
        SELECT DISTINCT VAR1, VAR2
        FROM WORK.TEMP3
        WHERE DUMMY=1 AND VAR2^=.
        ORDER BY VAR1, VAR2;

    /* The same pairs, ordered by VAR2 */
    CREATE TABLE WORK.TEMP5 AS
        SELECT DISTINCT VAR2, VAR1
        FROM WORK.TEMP3
        WHERE DUMMY=1 AND VAR2^=.
        ORDER BY VAR2, VAR1;

    /* Hop 1: every VAR1 that shares the pair's VAR2 (CIP2) */
    CREATE TABLE WORK.TEMP6 AS
        SELECT P.*, Q.VAR1 AS CIP2
        FROM WORK.TEMP4 AS P
        INNER JOIN WORK.TEMP5 AS Q
            ON (P.VAR2 = Q.VAR2);

    /* Hop 2: every VAR2 that shares the pair's VAR1 (IDHH2) */
    CREATE TABLE WORK.TEMP7 AS
        SELECT S.*, P.VAR2 AS IDHH2
        FROM WORK.TEMP6 AS S
        INNER JOIN WORK.TEMP4 AS P
            ON (S.VAR1 = P.VAR1);

    /* Distinct (IDHH2, CIP2) links discovered so far */
    CREATE TABLE WORK.TEMP8 AS
        SELECT DISTINCT IDHH2, CIP2
        FROM WORK.TEMP7;

    /* Hop 3: follow the links from IDHH2 to further VAR1 values (CIP3) */
    CREATE TABLE WORK.TEMP9 AS
        SELECT T.*, L.CIP2 AS CIP3
        FROM WORK.TEMP7 AS T
        INNER JOIN WORK.TEMP8 AS L
            ON (T.IDHH2 = L.IDHH2);

    /* Hop 4: follow the links from CIP3 to further VAR2 values (IDHH3) */
    CREATE TABLE WORK.TEMP10 AS
        SELECT U.*, L.IDHH2 AS IDHH3
        FROM WORK.TEMP9 AS U
        INNER JOIN WORK.TEMP8 AS L
            ON (U.CIP3 = L.CIP2);

    /* Every VAR2 value reached from each VAR1 */
    CREATE TABLE WORK.TEMP11 AS
        SELECT DISTINCT VAR1, IDHH3 AS VAR2
        FROM WORK.TEMP10
        ORDER BY VAR1, IDHH3;

    /* Largest reached VAR2 for each VAR1 */
    CREATE TABLE WORK.TEMP12 AS
        SELECT VAR1, MAX(VAR2) AS VAR2
        FROM WORK.TEMP11
        GROUP BY VAR1;

    /* Keep only the rows that carry that largest value */
    CREATE TABLE WORK.TEMP13 AS
        SELECT V.*
        FROM WORK.TEMP11 AS V
        INNER JOIN WORK.TEMP12 AS W
            ON (V.VAR1 = W.VAR1 AND V.VAR2 = W.VAR2);

    /* Attach the chosen value to each original record through VAR1 */
    CREATE TABLE WORK.TEMP14 AS
        SELECT R.*, K.VAR2 AS IDPERSONA
        FROM WORK.TEMP3 AS R
        LEFT JOIN WORK.TEMP13 AS K
            ON (R.VAR1 = K.VAR1);

    /* VAR2 to IDPERSONA lookup built from the records that received an ID */
    CREATE TABLE WORK.TEMP15 AS
        SELECT DISTINCT VAR2, IDPERSONA
        FROM WORK.TEMP14
        WHERE VAR2^=. AND IDPERSONA^=.;

    /* Second candidate ID obtained through VAR2, rows ordered by DATE */
    CREATE TABLE WORK.TEMP16 AS
        SELECT X.*, Y.IDPERSONA AS IDPERSONA2
        FROM WORK.TEMP14 AS X
        LEFT JOIN WORK.TEMP15 AS Y
            ON (X.VAR2 = Y.VAR2)
        ORDER BY DATE;
QUIT;

/* Use the VAR2-based ID where the VAR1-based one is missing, then drop the helper */
DATA TEMP16;
    SET TEMP16;
    IF IDPERSONA=. THEN IDPERSONA=IDPERSONA2;
    DROP IDPERSONA2;
RUN;

而正確的結果是:

VAR2 VAR1 DATE DUMMY IDPERSONA 
1 A 1 1 9 
1 A 2 1 9 
1 B 3 1 9 
2 C 4 1 2 
3 D 5 1 9 
4 E 6 1 4 
5 F 7 1 5 
6 B 8 1 9 
7 B 9 1 9 
1 D 10 1 9 
1 X 11 1 9 
7 G 14 1 9 
6 Y 15 1 9 
6 D 16 1 9 
6 I 18 1 9 
8 D 20 1 9 
9 Z 21 1 9 
9 X 22 1 9 
1
%macro grouper(
    inData /*Input dataset*/, 
    outData /*output dataset*/, 
    id1 /*First identification variable (must be numeric)*/, 
    id2 /*Second identification variable*/, 
    idOut /*Name of variable to contain group ID*/, 
    maxN = 5 /*Max number of iterations in case of failure*/); 
    /* Assign an ID to each distinct connected graph in a network.
       Two records are connected when they share a value of id1 or of id2.
       The group ID starts as the record's own id1 value and is lowered on
       each pass to the smallest ID seen for its id1 or its id2, until a
       pass changes nothing or maxN passes have run.
       The output dataset is a lookup table with one row per id1 value and
       its group ID. All datasets named _g_* in the default library are
       deleted at the end. */

    /* Keep the loop counter and stop flag out of the caller's symbol tables,
       so a caller's own variable i (for example an outer %do loop index) is
       not overwritten */
    %local i stopFlag; 
    %let stopFlag = 0; 

    /* Create first guess for group ID */ 
    data _g_temp; 
     set &inData.; 
     &idOut. = &id1.; 
    run; 

    /* Loop, improve group ID each time*/ 
    %let i = 1; 
    %do %while (&i. <= &maxN.); 
     %put Loop number &i.; 
     %let i = %eval(&i. + 1); 

     proc sql noprint; 
      /* Find the lowest group ID for each group of first variable */ 
      create table _g_map1 as 
      select 
       min(&idOut.) as &idOut., 
       &id1. 
      from _g_temp 
      group by &id1.; 

      /* Find the lowest group ID for each group of second variable */ 
      create table _g_map2 as 
      select 
       min(&idOut.) as &idOut., 
       &id2. 
      from _g_temp 
      group by &id2.; 

      /* Find the lowest group ID from both grouping variables */ 
      /* The two-argument min is the row-wise function, not the aggregate */
      create table _g_new as 
      select 
       a.&id1., 
       a.&id2., 
       coalesce(min(b.&idOut., c.&idOut.), a.&idOut.) as &idOut., 
       a.&idOut. as &idOut._old 
      from _g_temp as a 
      full outer join _g_map1 as b 
       on a.&id1. = b.&id1. 
      full outer join _g_map2 as c 
       on a.&id2. = c.&id2.; 

      /* Put results into temporary dataset ready for next iteration */ 
      create table _g_temp as 
      select * 
      from _g_new; 

      /* Check if the iteration provided any improvement */ 
      /* coalesce: on an empty table the aggregate is missing, which the
         %if below could not evaluate; nothing left to improve means 1 */
      select 
       coalesce(
        min(
         case when &idOut._old = &idOut. then 1 
         else 0 
         end), 
        1) into :stopFlag 
      from _g_temp; 
     quit; 

     /* End loop if ID unchanged over last iteration */ 
     %if &stopFlag. %then %let i = %eval(&maxN. + 1); 
    %end; 

    /* Output lookup table */ 
    proc sql; 
     create table &outData. as 
     select 
      &id1., 
      min(&idOut.) as &idOut. 
     from _g_temp 
     group by &id1.; 
    quit; 

    /* Clean up */ 
    proc datasets nolist; 
     delete _g_:; 
    quit; 
%mend grouper; 


/* Example network: numeric VAR1 linked to character VAR2 */
DATA baseData;
    INPUT VAR1 VAR2 $;
DATALINES;
1 A
1 A
1 B
2 C
3 D
4 E
5 F
6 B
7 B
1 D
1 X
7 G
6 Y
6 D
6 I
8 D
9 Z
9 X
;
RUN;

/* Build the VAR1 to groupID lookup table in outData */
%grouper(baseData, outData, VAR1, VAR2, groupID);
+0

看起來在小樣本上,每次迭代都會添加新的信息;在我的程序中我只(手動)執行了兩次迭代。要改進這個宏、爲它添加對Var1和Var2缺失值的支持,容易嗎?我的意思是,Var1 = . 且 Var2 = 「A」的記錄應被歸入「Var2 = A」組。問題在於您的輸出是基於Var1變量來定義組的,如果Var1缺失,事情就變得複雜了。也許產生兩個輸出,一個用於Var1 + idgroup,另一個用於Var2 + idgroup,可以解決這個問題。非常感謝您的幫助 – 2014-09-25 14:01:21