Fitting Cox Model Using PROC PHREG and Beyond in SAS

http://support.sas.com/resources/papers/proceedings09/236-2009.pdf

Cox PH 模型在倖存分析（survival analysis）被廣泛的使用。一篇發表在SAS GLOBAL FORUM 2009的技術文件解說了在 SAS 底下如何用 Cox PH 模型來計算一些重要估計量的方法，並且提供一個方便的巨集程式來簡化繁複的程式寫作和運行。

C-index
C-index原本是在ROC curve裡面用來解釋預測鑒別能力的測量指標，但後來在Cox PH model裡也可以應用。在倖存分析裡，C-index可以解釋為一個機率，是從某個事件(如：死亡)來的一個病人，他會比非該事件內的任何一個病人都擁有較高的機會會發生該事件。以下列程式來說明如何用 SAS 算出 C-index：

proc phreg data=sample;
  id idn;
  model combdays*combfv(0)=mihx diabhx lowef;
  output out=obs survival=surv;
run;

這個PROC PHREG程序並沒有什麼特別，使用者唯一需要額外做的就是利用一個ODS OUTPUT指令把個體的ID, 觀測到的倖存時間以及倖存函數估計值另存一個新檔(ods)。接著，用下面兩到程序去整理出一個concordance data。

proc sql;
      create table allset as
      select idn_j, y_j, x_j, idn as idn_i, surv as y_i, combdays as x_i
      from evtset, obs
      where idn_j<>idn;
quit;
data concord;
      set allset;
      if (x_iy_j) or (x_i>x_j and y_i

然後用下面這一串程式直接把C-index及其95%信賴區間該算出來：

data _null_;
    set concord end=eof;
    retain nch ndh;
    if _N_=1 then do;
      nch=0;
      ndh=0;
      end;
    if concord=1 then nch+1;
    if concord=0 then ndh+1;
    if eof=1 then do;
      call symput('ch',trim(left(nch)));
      call symput('dh',trim(left(ndh)));
      call symput('uspairs',trim(left(_n_)));
      end;
run;
data _null_;
     set sample end=eof;
     if eof=1 then call symput('totobs',trim(left(_n_)));
run;
%put &ch &dh &uspairs &totobs;
data calculat;
    ch=input("&ch",12.0);
    dh=input("&dh",12.0);
    uspairs=input("&uspairs",12.0);
    totobs=input("&totobs",10.0);
    pc=ch/(totobs*(totobs-1));
    pd=dh/(totobs*(totobs-1));
    c_hat=pc/(pc+pd);
    w=(2*1.96**2)/(totobs*(pc+pd));
    low_ci_w=((w+2*c_hat)/(2*(1+w)))-(sqrt((w**2+4*w*c_hat*(1-c_hat))/(2*(1+w))));
    upper_ci_w=((w+2*c_hat)/(2*(1+w)))+(sqrt((w**2+4*w*c_hat*(1-c_hat))/(2*(1+w))));
run;

結果如下圖所示：

Calculating adjusted survival rates using corrected group prognosis method
作者自製了一段巨集程式 %CGPM_adj 可將倖存機率以及倖存曲線圖直接輸出。其原始碼如下：

*****************************************************************************;
* Program: CGPM_adj.sas;
* Purpose: Calculate the unadjusted and adjusted survival rates and plot the
*survival curves using corrected group prognosis method;
*****************************************************************************;
*----------------------------------------------------------------------------;
* 1. All variables used in the model have to be binary variable.
*Categorical and numeric variables need to be re-coded as dummy
*variable to meet this requirement;
* 2. The maximum number of variables in the model is limited to 20;
*(not including the control variable). But this number can be
*easily expanded as needed.;
* 3. The analysis dataset for modeling was assign to be in SAS
*default library WORK. ;
*----------------------------------------------------------------------------;
%macro CGPM_adj(outputpath, figname, tbname, dsn, covars, ctrlvar, outcom, cnsrval=0, timevar, rlalpha=0.05);
*----------------------------------------------------------------------------;
* Calculate unadjusted survival rate;
*----------------------------------------------------------------------------;
*create dataset for controlled variable;
proc sql;
      create table ctrlset as
      select distinct &ctrlvar
      from &dsn
      order by &ctrlvar;
quit;
*get the unadjusted survival rates for controlled variable;
proc phreg data=&dsn noprint;
      model &timevar * &outcom (&cnsrval) = &ctrlvar;
      baseline covariates=ctrlset out=unadjset survival=survival / nomean;
run;

*----------------------------------------------------------------------------;
* Calculate adjusted survival rates using corrected group prognosis method;
*----------------------------------------------------------------------------;
*create identifier set for all possible combinations of covariates;
data _null_;
      cnt=1+count("&covars",' ');
      call symputx('varcnt', cnt);
run;
%put &varcnt;

%macro idx;
      %let i=1;
      %do %until(%scan(&covars,&i,' ')=%str());
      %local varnam;
            %let varnam=%scan(&covars,&i,' ');
            xp=&i+1;
            if &varnam=1 then substr(idx,xp,1)=1;
      %let i=%eval(&i+1);
      %end;
%mend;
%macro idxrv;
      %let i=1;
      %do %until(%scan(&covars,&i,' ')=%str());
      %local varnam;
            %let varnam=%scan(&covars,&i,' ');
            xp=&i+1;
            if substr(idx,xp,1)='1' then &varnam=1; else &varnam=0;
      %let i=%eval(&i+1);
      %end;
%mend;

data modelset;
      set &dsn;
      idx=left(put(10**(input("&varcnt",2.0)),21.));
      %idx;
      keep idx &ctrlvar &covars &timevar &outcom;
run;
proc freq data=modelset noprint;
      tables idx / out=idx;
run;
data popset (drop=percent xp);
      if _n_=1 then do until(last);
            set idx end=last;
            &ctrlvar=0;
            %idxrv;
            output;
            end;
      set idx;
            &ctrlvar=1;
            %idxrv;
            output;
run;
ods output censoredsummary=censum parameterEstimates=parasum;
proc phreg data=modelset;
      id idx;
      model &timevar * &outcom (&cnsrval) = &ctrlvar &covars / rl alpha=&rlalpha;
      baseline covariates=popset out=adjset0 survival=survival / nomean;
run;
ods output close;
proc sort data=popset out=count (keep=idx count);
by idx;
run;
proc sort data=adjset0;
by idx;
run;
data adjset1;
      merge adjset0 count;
      by idx;
run;
proc sort data=adjset1;
      by &ctrlvar &timevar;
run;
data adjset;
      set adjset1;
      by &ctrlvar &timevar;
            if first.&timevar then frqsum=0;
            frqsum+count;
            if first.&timevar then sursum=0;
            sursum+survival*count;
      if last.&timevar;
      adjsurv=sursum/frqsum;
      keep &ctrlvar &timevar adjsurv;
run;
data allsurv;
      length group $22;
      set adjset (in=a rename=(adjsurv=survival)) unadjset (in=b);
      if a=1 and &ctrlvar=0 then group='Adjusted'||' '||"&ctrlvar"||'=0';
     else if a=1 and &ctrlvar=1 then group='Adjusted'||' '||"&ctrlvar"||'=1';
      else if b=1 and &ctrlvar=0 then group='Unadjusted'||'
'||"&ctrlvar"||'=0';
      else if b=1 and &ctrlvar=1 then group='Unadjusted'||'
'||"&ctrlvar"||'=1';
      eventrate=1-survival;
run;
data summary;
      set allsurv;
      by group;
      if last.group;
run;
*----------------------------------------------------------------------------;
* You can change the code below to modify the title and graph axis scale etc.;
*----------------------------------------------------------------------------;
goption reset=all device=emf gsfname=grfa hsize=6.0 vsize=4 ftext='Arial/bo' htext=11pt display;
filename grfa "&outputpath.\&figname..emf";

symbol1 i=steplj v=none l=1 ci=green w=1;
symbol2 i=steplj v=none l=1 ci=black w=1;
symbol3 i=steplj v=none l=3 ci=green w=1;
symbol4 i=steplj v=none l=3 ci=black w=1;
axis1 offset=(0.5 cm,0.5 cm);

axis2 label=(angle=90 j=c 'Survival') order=(0.8 to 1 by 0.05) offset=(0.0cm,0.0 cm);
   legend1 across=2
   mode=share
   position=(bottom left inside)
   label=none
   value=(j=l h=10pt)
   offset=(0.5cm, 0cm);
proc gplot data=allsurv;
   plot survival * &timevar = group / legend=legend1 haxis=axis1 vaxis=axis2;
   label eventrate='Survival Rate' &timevar='Days After Randomization';
run;
quit;
title;
footnote;
ods rtf style=journal file="&outputpath.\&tbname..rtf" bodytitle
startpage=no;
proc print data=parasum noobs;
title "Analysis of Maximum Likelihood Estimates (alpha=&rlalpha for CI)";
run;
proc print data=summary noobs;
var group survival eventrate;
title 'Estimated survival and event rates at the time of last event observed';
run;
ods rtf close;
%mend;

巨集參數定義如下：

outputpath=報表與圖形輸出的路徑
figname=圖形檔名
tbname=報表檔名
dsn=存放在SAS裡面的資料檔名
covars=所有預測變數的名稱，不包含控制變數的名稱
ctrlvar=控制變數的名稱
outcom=輸出變數(事件)的名稱
cnsrval=設限的代碼，預設值為0
timevar=時間變數名稱
rlalpha=顯著水準alpha，預設值為0.05

承上面的範例，巨集程式執行的寫法如下：

%CGPM_adj(outputpath=L:\SGF2009\Examples, 
          figname=Adjdiab_fig, 
          tbname=adjdiab_summary, 
          dsn=sample, 
          covars=age65 lowef chfhx, 
          ctrlvar=diabhx, 
          outcom=deathfv, 
          cnsrval=0, 
          timevar=deathdays, 
          rlalpha=0.05);

結果如下：

Presenting the effect of a continuous covariate on estimated survival
當模型內包含連續型變數時，需要額外多兩個步驟來計算n-year倖存率。首先，用PROC PHREG程序把基底函數(baseline function)等一干相關的估計量另存成兩個新檔 yrout & sample(紅色部分)：

proc phreg data=sample;
model combdays*combfv(0)=age male;
baseline out=yrout lower=low_ci upper=up_ci survival=surv covariates=sample/alpha=0.05 nomean;
run;

再依照想要分類的變數(male)，去和倖存時間變數去排序(combdays)後再另存新檔(maxday)：

proc sort data=sample out=maxday (where=(combfv=1) keep=idn male combfv combdays);
      by male combdays;
run;

然後利用下面兩個資料步驟程序去計算男人和女人的5-year倖存率：

data _null_;
      set maxday;
      by male;
      if male=0 and last.male then call symput('lastday_f',combdays);
      if male=1 and last.male then call symput('lastday_m',combdays);
run;
%put &lastday_f &lastday_m;
data yrest;
      set yrout;
      evtrate=(1-surv)*100;
      evtlow=(1-low_ci)*100;
      evtup=(1-up_ci)*100;
      if combdays=&lastday_m and male=1 then output;
      if combdays=&lastday_f and male=0 then output;
run;

最後畫出的圖形如下所示：

(PS) 原文內並沒有附此圖的繪製程式。

Contact information
Lea Liu, Maryland Medical Research Institute
600 Wyndhurst Ave. Baltimore, MD 21210
(410) 435-4200, Email: lliu@mmri.org, Web: www.mmri.org

SUGI CLUB — SAS User Group International Club

公告

2011年10月14日星期五