2017-06-29 130 views
1

SAS:添加一個變量,從另一個數據集取值

我有2個數據集A和B。我想在A中添加變量(列),其值取自數據集B。例如,我的數據集A和B如下:

Table A 
year return     
1990 4.5 
1991 6.2 
1992 3 
1993 9.9 


Table B 
year type    value 
1992 bond_threshold 10 
1992 stock_threshold 15 

和我期望的新數據集:

year return bond_threshold  stock_threshold 
1990 4.5      
1991 6.2     
1992 3  10     15 
1993 9.9     

我應該怎麼做?我嘗試了合併,但它爲1992年生成了2條觀測:

/* Match-merge A with B on year. B holds two rows for 1992, so the
   result carries two 1992 observations (one per B row). */
data want;
    merge A
          B;
    by year;
run;

其結果是:

year  return  type     value  
1990  4.5      
1991  6.2     
1992  3   bond_threshold  10     
1992  3   stock_threshold  15 
1993  9.9  
+0

請張貼到目前爲止你已經嘗試過什麼,什麼沒有奏效。 – Reeza

+0

@Reeza我會,但我不知道。我正在考慮使用數據步驟來合併 – duckman

+0

然後發佈您的嘗試代碼到目前爲止。下面是一個類似的例子,你可以通過:https://github.com/statgeek/SAS-Tutorials/blob/master/add_average_value_to_dataset – Reeza

回答

2

有兩個選擇:一個是先用PROC TRANSPOSE把數據集B轉置成寬格式,然後再做合併。第二種是與數據集B合併兩次,每次只保留感興趣的那個變量。

根據你的問題的尺度,一個比另一個容易。

下面是第二個選項的示例。

/* Add bond_threshold and stock_threshold to A by match-merging B twice,
   once per threshold type.
   - The WHERE= literals must match the stored TYPE values exactly
     (they contain an underscore: bond_threshold / stock_threshold).
   - Each copy of B renames VALUE to its own column; otherwise both copies
     supply a variable called VALUE and the copy read last overwrites the
     other, so only one number would survive.
   Years of A with no matching row in B get missing thresholds. */
data want;
    merge a
          b (where = (type='bond_threshold')  rename = (value=bond_threshold))
          b (where = (type='stock_threshold') rename = (value=stock_threshold));

    by Year;
    drop type;   /* TYPE only selected the rows; it is not part of the result */
run;
+0

非常感謝。我已經嘗試了第一個選項,它的工作原理。你的第二個選項似乎也適用。 – duckman

2

我用不同的數據量做了一些性能分析。年數分別爲100.000,1.000.000,10.000.000和100.000.000。我還使用了一個%transpose宏(http://www.sascommunity.org/mwiki/images/b/be/BB-07-2013.sas),因爲它比proc transpose快。

enter image description here

測量了每個步驟的CPU時間。

結論:對於大型數據集,建議不要使用變體2。在全部四次運行中,最穩定的性能由變體2和3提供。對於非常大的數據集(表a中超過100.000.000行),變體3表現更好,因爲merge比proc sql更快。

下面是轉置宏:

/*
  %transpose - reshape a data set from long to wide with a generated data step
  (arrays + a lookup format) rather than PROC TRANSPOSE.

  Parameters (all keyword, all optional at the macro level):
    libname_in    libref of the input; set to WORK when empty and data= is a
                  one-level name; overridden by a two-level data= value
    libname_out   libref of the output; same defaulting rule, driven by out=
    data          input data set; may be a two-level name and may carry data set
                  options in parentheses (they are split off into &dsoptions)
    out           output data set (one- or two-level name)
    by            BY variable(s); the last one drives first./last. processing
    prefix        text placed at the front of every generated variable name
    var           variable(s) to transpose; when empty the list is read from the
                  input after dropping the by/id/copy variables
    autovars      only used when var is empty: CHAR selects character variables,
                  ALL selects every variable, anything else selects numeric ones
    id            variable whose formatted values become part of the new names;
                  when empty a counter named &newid is created per BY group
    descendingid  YES sorts the discovered id values in descending order
    var_first     NO puts the id value before the variable name in the generated
                  names; any other value puts the variable name first
    format        format used in put(&id, ...); when empty it is taken from the
                  id variable's own format, else built from its type and length
    delimiter     text placed between the variable name and the id value
    copy          extra variable(s) kept from the input and carried to the output
    drop          extra variable(s) dropped from the output
    sort          YES sorts the input by &by into work.t_e_m_p first; otherwise
                  the final BY statement uses NOTSORTED
    sort_options  extra options appended to the PROC SORT statement
    use_varname   NO leaves the variable name out of the generated names
    preloadfmt    data set used instead of PROC FREQ as the source of id values;
                  the code reads &id and a variable named order from it
    guessingrows  OBS= limit for the PROC FREQ that discovers id values;
                  defaults to constant(EXACTINT)
    newid         name of the generated id counter; defaults to row

  Side effects visible in the code: creates and finally deletes work.t_e_m_p and
  work._for_format, creates the format labelfmt / $labelfmt through PROC FORMAT
  CNTLIN=, and temporarily sets OPTIONS MISSING=. (the saved value is restored).
*/
%macro transpose(libname_in=, 
        libname_out=, 
        data=, 
        out=, 
        by=, 
        prefix=, 
        var=, 
        autovars=, 
        id=, 
        descendingid=, 
        var_first=, 
        format=, 
        delimiter=, 
        copy=, 
        drop=, 
        sort=, 
        sort_options=, 
        use_varname=, 
        preloadfmt=, 
        guessingrows=, 
        newid=); 

    /*Check whether the data and out parameters contain one or two-level filenames*/ 
    /*First: if data= contains parentheses, move the text between the first "(" and the
      last ")" into dsoptions and keep only the name in front of it*/ 
    %let lp=%sysfunc(findc(%superq(data),%str(%())); 
    %if &lp. %then %do; 
     %let rp=%sysfunc(findc(%superq(data),%str(%)),b)); 
     %let dsoptions=%qsysfunc(substrn(%nrstr(%superq(data)),&lp+1,&rp-&lp-1)); 
     %let data=%sysfunc(substrn(%nrstr(%superq(data)),1,%eval(&lp-1))); 
    %end; 
    %else %let dsoptions=; 
    /*Two words in data= means libref.member: split them; otherwise default to WORK*/ 
    %if %sysfunc(countw(&data.)) eq 2 %then %do; 
     %let libname_in=%scan(&data.,1); 
     %let data=%scan(&data.,2); 
    %end; 
    %else %if %length(&libname_in.) eq 0 %then %do; 
     %let libname_in=work; 
    %end; 

    /*Same split / default for the output name*/ 
    %if %sysfunc(countw(&out.)) eq 2 %then %do; 
     %let libname_out=%scan(&out.,1); 
     %let out=%scan(&out.,2); 
    %end; 
    %else %if %length(&libname_out.) eq 0 %then %do; 
     %let libname_out=work; 
    %end; 

    /*Default name for the id counter that is generated when no id is given*/ 
    %if %length(&newid.) eq 0 %then %do; 
     %let newid=row; 
    %end; 

    /*obtain last by variable*/ 
    %if %length(&by.) gt 0 %then %do; 
     %let lastby=%scan(&by.,-1); 
    %end; 
    %else %do; 
     %let lastby=; 
    %end; 

    /*Create macro variable to contain a list of variables to be copied*/ 
    /*A one-row copy is written to work.t_e_m_p and the names are read back from
      dictionary.columns, so the list holds the names as stored in that copy*/ 
    %let to_copy=; 
    %if %length(&copy.) gt 0 %then %do; 
     data t_e_m_p; 
     set &libname_in..&data. (obs=1 keep=&copy.); 
     run; 

     proc sql noprint; 
     select name 
      into :to_copy separated by " " 
      from dictionary.columns 
       where libname="WORK" and 
        memname="T_E_M_P" 
      ; 
     quit; 
    %end; 

    /*Populate var parameter in the event it has a null value*/ 
    %if %length(&var.) eq 0 %then %do; 
     data t_e_m_p; 
     set &libname_in..&data. (obs=1 drop=&by. &id. &copy.); 
     run; 

     proc sql noprint; 
     select name 
      into :var separated by " " 
      from dictionary.columns 
       where libname="WORK" and 
        memname="T_E_M_P" 
      %if %sysfunc(upcase("&autovars.")) eq "CHAR" %then %do; 
        and type="char" 
      %end; 
      %else %if %sysfunc(upcase("&autovars.")) ne "ALL" %then %do; 
        and type="num" 
      %end; 
      ; 
     quit; 
    %end; 

    /*Initialize macro variables*/ 
    %let vars_char=; 
    %let varlist_char=; 
    %let vars_num=; 
    %let varlist_num=; 
    %let formats_char=; 
    %let format_char=; 
    %let formats_num=; 
    %let format_num=; 

    /*Create file t_e_m_p to contain one record with all var variables*/ 
    data t_e_m_p; 
     set &libname_in..&data. (obs=1 keep=&var.); 
    run; 

    /*Create macro variables containing untransposed var names and formats*/ 
    /*Formats are joined with "~" and later picked apart with scan(...,"~");
      a missing format falls back to $<length>. for character and best12. for numeric*/ 
    proc sql noprint; 
     select name, case 
        when missing(format) then " $"||strip(put(length,5.))||'.' 
        else strip(format) 
        end 
     into :vars_char separated by " ", 
      :formats_char separated by "~" 
      from dictionary.columns 
      where libname="WORK" and 
        memname="T_E_M_P" and 
        type="char" 
     ; 
     select name, case 
        when missing(format) then "best12." 
        else strip(format) 
        end 
     into :vars_num separated by " ", 
      :formats_num separated by "~" 
      from dictionary.columns 
      where libname="WORK" and 
        memname="T_E_M_P" and 
        type="num" 
     ; 
     select name 
     into :vars_all separated by " " 
      from dictionary.columns 
      where libname="WORK" and 
        memname="T_E_M_P" 
     ; 
    quit; 

    /*If sort parameter has a value of YES, create a sorted temporary data file*/ 
    /*From here on the sorted copy work.t_e_m_p is the input; without sorting the
      final BY statement gets the NOTSORTED keyword instead*/ 
    %if %sysfunc(upcase("&sort.")) eq "YES" %then %do; 
     %let notsorted=; 
     proc sort data=&libname_in..&data. 
        (
        keep=&by. &id. &vars_char. &vars_num. &to_copy. 
        &dsoptions. 
       ) 
        out=t_e_m_p &sort_options. noequals; 
     by &by.; 
     run; 
     %let data=t_e_m_p; 
     %let libname_in=work; 
    %end; 
    %else %do; 
     %let notsorted=notsorted; 
    %end; 

    /*if no id parameter is present, create one from &newid.*/ 
    /*The counter restarts at 1 on the first row of each &lastby group*/ 
    %if %length(&id.) eq 0 %then %do; 
     data t_e_m_p; 
     set &libname_in..&data.; 
     by &by.; 
     if first.&lastby then &newid.=1; 
     else &newid+1; 
     run; 
     %let id=&newid.; 
     %let data=t_e_m_p; 
     %let libname_in=work; 
    %end; 

    /*Ensure guessingrows parameter contains a value*/ 
    %if %length(&guessingrows.) eq 0 %then %do; 
     %let guessingrows=%sysfunc(constant(EXACTINT)); 
    %end; 

    /*Ensure a format is assigned to an id variable*/ 
    /*NOTE(review): %sysfunc(strip(format)) is evaluated by the macro processor on the
      literal text "format", so the select list presumably ends up as the plain column
      name format - confirm this is intended*/ 
    %if %length(&id.) gt 0 %then %do; 
     proc sql noprint; 
     select type,length,%sysfunc(strip(format)) 
      into :tr_macro_type, :tr_macro_len, :tr_macro_format 
      from dictionary.columns 
       where libname="%sysfunc(upcase(&libname_in.))" and 
        memname="%sysfunc(upcase(&data.))" and 
        upcase(name)="%sysfunc(upcase(&id.))" 
      ; 
     quit; 

     /*No format passed: use the id variable's own format, else best<len>. / $<len>.
       The MISSING option is saved, set to "." while the format is built, then restored*/ 
     %if %length(&format.) eq 0 %then %do; 
     %let optsave=%sysfunc(getoption(missing),$quote.); 
     options missing=.; 
     %if %length(&tr_macro_format.) gt 0 %then %do; 
      %let format=&tr_macro_format.; 
     %end; 
     %else %if "&tr_macro_type." eq "num " %then %do; 
      %let format=%sysfunc(catt(best,&tr_macro_len.,%str(.))); 
     %end; 
     %else %do; 
      %let format=%sysfunc(catt($,&tr_macro_len.,%str(.))); 
     %end; 
     options missing=&optsave; 
     %end; 
    %end; 

    /*Create macro variables containing ordered lists of the requested transposed variable 
    names for character (varlist_char) and numeric (varlist_num) var variables */ 
    /*With preloadfmt the id values come from that data set (qualified with &libname_in
      when given as a one-level name); otherwise PROC FREQ collects the distinct id values
      into _for_format and order=_n_ records their position*/ 
    %if %length(&preloadfmt.) gt 0 %then %do; 
     %if %sysfunc(countw(&preloadfmt.)) eq 1 %then %do; 
     %let preloadfmt=&libname_in..&preloadfmt.; 
     %end; 
    %end; 
    %else %do; 
     %if %sysfunc(upcase("&sort.")) eq "YES" %then 
     %let dsoptions=; 
     proc freq data=&libname_in..&data. (obs=&guessingrows. keep=&id. &dsoptions.) 
     noprint; 
     tables &id./out=_for_format (keep=&id.); 
     run; 
     %if %sysfunc(upcase("&descendingid.")) eq "YES" %then %do; 
     proc sort data=_for_format; 
      by descending &id; 
     run; 
     %end; 
     data _for_format; 
     set _for_format; 
     order=_n_; 
     run; 
    %end; 

    /*One generated query per variable type (i=1 char, i=2 num). For each id value it
      builds the new variable names (j=1) and matching "format <name> <fmt>;" statements
      (j=2), concatenated over all var variables (k), plus the id value and its order*/ 
    proc sql noprint; 
    %do i=1 %to 2; 
     %if &i. eq 1 %then %let i_type=char; 
     %else %let i_type=num; 
     %if %length(&&vars_&i_type.) gt 0 %then %do; 
     select distinct 
     %do j=1 %to 2; 
      %if &j. eq 1 %then %let j_type=; 
      %else %let j_type=format; 
      %do k=1 %to %sysfunc(countw(&&vars_&i_type.)); 
      "&j_type. "||cats("&prefix.", 
      %if %sysfunc(upcase("&var_first.")) eq "NO" %then %do; 
       put(&id.,&format),"&delimiter." 
       %if %sysfunc(upcase("&use_varname.")) ne "NO" %then 
       ,scan("&&vars_&i_type.",&k.); 
      %end; 
      %else %do; 
       %if %sysfunc(upcase("&use_varname.")) ne "NO" %then 
       scan("&&vars_&i_type.",&k.),; 
       "&delimiter.",put(&id.,&format) 
      %end; 
      ) 
      %if &j. eq 2 %then 
       ||" "||cats(scan("&&formats_&i_type.",&k.,"~"),";"); 
      %if &k. lt %sysfunc(countw(&&vars_&i_type.)) %then ||; 
      %else ,; 
      %end; 
     %end; 
     %if "&tr_macro_type." eq "num " %then &id. format=best12.; 
      %else &id.; 
      ,order 
      into :varlist_&i_type. separated by " ", 
       :format_&i_type. separated by " ", 
       :idlist separated by " ", 
       :idorder separated by " " 
      %if %length(&preloadfmt.) gt 0 %then from &preloadfmt.; 
      %else from _for_format; 
       order by order 
     ; 
     %let num_numlabels=&sqlobs.; 
     %end; 
    %end; 
    quit; 

    /*Same name generation over every var variable (char and num together); the result
      varlist_all is used in the final retain statement*/ 
    /*NOTE(review): in the var_first=NO branch with use_varname=NO the generated cats()
      call appears to end in a trailing comma before ")" - confirm*/ 
    proc sql noprint; 
     select distinct 
      %let j_type=; 
      %do k=1 %to %sysfunc(countw(&&vars_all.)); 
     "&j_type. "||cats("&prefix.", 

      %if %sysfunc(upcase("&var_first.")) eq "NO" %then %do; 
      put(&id.,&format),"&delimiter.", 
       %if %sysfunc(upcase("&use_varname.")) ne "NO" %then 
      scan("&&vars_all.",&k.); 
      ) 
      %end; 
      %else %do; 
       %if %sysfunc(upcase("&use_varname.")) ne "NO" %then 
      scan("&&vars_all.",&k.),; 
      "&delimiter.",put(&id.,&format)) 
      %end; 
      %if &k. lt %sysfunc(countw(&&vars_all.)) %then ||; 
      %else ,; 
      %end; 
      order 
      into :varlist_all separated by " ", 
       :idorder separated by " " 
      %if %length(&preloadfmt.) gt 0 %then from &preloadfmt.; 
      %else from _for_format; 
       order by order 
     ; 
    quit; 

    /*Create a format that will be used to assign values to the transposed variables*/ 
    /*The label is the zero-based position of each id value (_n_-1 or order-1)*/ 
    data _for_format; 
     %if %length(&preloadfmt.) gt 0 %then set &preloadfmt. (rename=(&id.=start)); 
     %else set _for_format (rename=(&id.=start)); 
     ; 
     %if "&tr_macro_type." eq "num " %then retain fmtname "labelfmt" type "N"; 
     %else retain fmtname "$labelfmt" type "C"; 
     ; 
     label= 
     %if %length(&preloadfmt.) eq 0 %then _n_-1; 
     %else order-1; 
     ; 
    run; 

    proc format cntlin = _for_format; 
    run ; 

    /*Create and run the datastep that does the transposition*/ 
    /*For each input row, put(&id,labelfmt.) gives the zero-based id position; multiplied
      by the number of var variables it is the offset into the retained want_ array where
      this row's values are stored. One row is output at the end of each &lastby group*/ 
    data &libname_out..&out.; 
     set &libname_in..&data. (keep=&by. &id. 
     %do i=1 %to %sysfunc(countw("&vars_char.")); 
      %scan(&vars_char.,&i.) 
     %end; 
     %do i=1 %to %sysfunc(countw("&vars_num.")); 
      %scan(&vars_num.,&i.) 
     %end; 
     %do i=1 %to %sysfunc(countw("&to_copy.")); 
      %scan(&to_copy.,&i.) 
     %end; 
     &dsoptions. 
     ); 
     by &by. &notsorted.; 
     &format_char. &format_num. 
    %if %length(&vars_char.) gt 0 %then %do; 
     array want_char(*) $ 
     %do i=1 %to %eval(&num_numlabels.*%sysfunc(countw("&vars_char."))); 
     %scan(&varlist_char.,&i.) 
     %end; 
     ; 
     array have_char(*) $ &vars_char.; 
     retain want_char; 
     if first.&lastby. then call missing(of want_char(*)); 
     ___nchar=put(&id.,labelfmt.)*dim(have_char); 
     do ___i=1 to dim(have_char); 
     want_char(___nchar+___i)=have_char(___i); 
     end; 
    %end; 
    %if %length(&vars_num.) gt 0 %then %do; 
     array want_num(*) 
     %do i=1 %to %eval(&num_numlabels.*%sysfunc(countw("&vars_num."))); 
     %scan(&varlist_num.,&i.) 
     %end; 
     ; 
     array have_num(*) &vars_num.; 
     retain want_num; 
     if first.&lastby. then call missing(of want_num(*)); 
     ___nnum=put(&id.,labelfmt.)*dim(have_num); 
     do ___i=1 to dim(have_num); 
     want_num(___nnum+___i)=have_num(___i); 
     end; 
    %end; 
     drop &id. ___: &var. &drop.; 
     if last.&lastby. then output; 
    run; 

    /*Re-read the output with a leading retain so the columns appear in the order
      by variables, copied variables, transposed variables*/ 
    data &libname_out..&out.; 
     retain &by. &to_copy. &varlist_all.; 
     set &libname_out..&out.; 
    run; 

    /*Delete all temporary files*/ 
    proc delete data=work.t_e_m_p work._for_format; 
    run; 

    %mend transpose; 

下面是性能測試的代碼:

/********************************************* 
     PERFORMANCE TEST: PREPARING DATA 

     n_years sets the size of the test: table A gets one row per year,
     table B gets two rows per year (one per threshold type).
     Change this single value to rerun the test at another size.
    *********************************************/ 
    %let n_years=100000000;

    /* Table A: one observation per year with a constant return. */
    data a; 
    do year=1 to &n_years.; 
     return=4.5; 
     output; 
    end; 
    run; 

    /* Table B: long format, two observations per year. */
    data b; 
    /* Declared before the first assignment so that both type strings fit;
       otherwise the length would be taken from "bond_threshold" (14 chars)
       and "stock_threshold" would be truncated. */
    length type $20; 
    do year=1 to &n_years.; 
     type="bond_threshold"; 
     value=10; 
     output; 
     type="stock_threshold"; 
     value=10; 
     output; 
    end; 
    run; 

    %put ++++++++++ Variant 1 +++++++++++++++++++++++++++++++++++++++++++++++; 

    /* Pivot B to one row per year with one column per TYPE value.
       use_varname=no keeps the VAR name out of the generated column names,
       so they are exactly bond_threshold and stock_threshold as selected in
       the join below. With the macro's default the names would start with
       the VAR name (valuebond_threshold, valuestock_threshold) and the
       SELECT would not find its columns. */
    %transpose(data=b, out=b2, 
     by=year, var=value, 
     id=type, use_varname=no 
); 

    /* Left join: every year of A is kept; thresholds are missing where B2
       has no row for that year. */
    proc sql noprint; 
    CREATE TABLE wanted AS 
    SELECT a.year 
     ,a.return 
     ,b2.bond_threshold 
     ,b2.stock_threshold 
    FROM a 
    LEFT JOIN b2 
    ON a.year=b2.year 
    ; 
    quit; 

    %put +++++++++++ Variant 2 ++++++++++++++++++++++++++++++++++++++++++++++; 

    /* Put both inputs in year order, as the BY-merge below requires. */
    proc sort data=a;  by year; run;
    proc sort data=b2; by year; run;

    /* Match-merge A with the already transposed B2 on year. */
    data want2;
        merge a b2;
        by Year;
    run;

    %put ++++++++++ Variant 3 +++++++++++++++++++++++++++++++++++++++++++++++; 

    /* B must be in year order for the BY-merge below. */
    proc sort data=b; 
    by year; 
    run; 

    /* Read B twice, once per threshold type. Each copy renames VALUE to its
       own column; without the rename both copies supply a variable called
       VALUE, the copy read last overwrites the other, and the output would
       not contain bond_threshold / stock_threshold like Variants 1 and 2. */
    data want; 
    merge a 
      b (where = (type='bond_threshold') rename = (value=bond_threshold)) 
      b (where = (type='stock_threshold') rename = (value=stock_threshold)); 

    by Year; 
    drop type;   /* TYPE only selected the rows; it is not part of the result */
    run; 
相關問題