## Bootstrapping Gini Coefficients

DATA BOOT SKEW; /* read the data and create two data sets: BOOT contains the original data and all the bootstrap samples, SKEW contains the original data and samples needed to estimate the coefficient in the accelerated bootstrap */
ARRAY DATA{200} D1-D200; /* array to store the original data, set up for a maximum of 200 data points, increase here and in next statement if necessary */
RETAIN D1-D200 /* tell SAS not to clear the array */
NPT 0; /* NPT counts the number of data points */
KEEP IBOOT /* IBOOT is the number of the bootstrap sample, 0 = original data */
ISKEW /* ISKEW is the number of sample in the skew data set, 0 = original data */
X;  /* X is an observed value */
INFILE CARDS EOF=WRITE; /* goto the section labeled write when done reading the data */
/* USER specified parameters */
NBOOT = 1000; /* number of bootstrap samples to draw */
ALPHA = 0.05; /* 1 - size of confidence interval */
/* read in original data and copy to boot data set */
IBOOT = 0;  /* iboot = 0 => working with original data */
INPUT X @@; /* read a data value, possibly more than one on a line */
NPT = NPT + 1; /* update the number of data values */
DATA{NPT} = X; /* store this value in the data array */
OUTPUT BOOT; /* output to the boot data set */
RETURN; /* and go get another input value */
/* after reading all the data, create the boot and skew data sets */
WRITE: /* generate nboot bootstrap samples and output them */
DO IBOOT = 1 TO NBOOT; /* loop executed once for each sample */
DO I = 1 TO NPT; /* loop executed once for each point */
PT = 1+FLOOR(NPT*UNIFORM(0)); /* randomly choose an integer from 1 to npt, with replacement */
X = DATA{PT}; /* find that data value in the array */
OUTPUT BOOT; /* and store it on the data set */
END;
END;
/* use SAS macro variables to store useful pieces of information between data steps */
CALL SYMPUT("NPT",NPT); /* the number of data points */
CALL SYMPUT("NBOOT",NBOOT); /* the number of bootstrap samples */
CALL SYMPUT("ALPHA",ALPHA); /* and the alpha level */

/*** The accelated bootstrap needs the coefficient a, which is the skewness of the influence function. Estimate this by the skewness of the empirical influence estimates. For each data point, compute the empirical influence by comparing the statistic calculated from the original data set repeated 5 times with the statistic calculated from 6 repeats of the target data point and five repeats of rest data set. ***/

T = 5; /* number of copies of each point */
CALL SYMPUT("T",T); /* store in a macro variable for future use */
DO ISKEW = 0 TO NPT; /* loop to repeat once for each data point, ISKEW = 0 indicates just multiplying original data, not adding any points */
IF ISKEW > 0 THEN DO; /* if need to add a point, do it */
X = DATA{ISKEW}; /* by finding that data value in the array */
OUTPUT SKEW; /* and storing it on the skew data set */
END;
DO J = 1 TO NPT; /* loop to add t copies of every data value */
X = DATA{J}; /* find the jth value from the array */
DO K = 1 TO T; /* and add t copies to the skew data set */
OUTPUT SKEW;
END;
END;
END;

/*** USER adds raw data here. Current input command is set up to read values of one variable. These may be stored 1 value per line or more than one value per line. An unlimited number of lines of data is permitted. If more than 200 data values, users needs to increase the size of the data array and increase the number of D variables used (see top of this data step). ***/

CARDS;
18 18 20 23 25 28
;

/*** Calculate Gini coefficient for the original data and each bootstrap sample. To bootstrap other statistics, replace the following PROC SORT and DATA step with a procedure that calculates a statistic BY IBOOT and stores it in a variable named BOOTX on an output data set named BOOTS. ***/

PROC SORT DATA=BOOT; /*sort each bootstrap sample so that the values are in increasing order */
BY IBOOT X;
DATA BOOTS; SET BOOT;
FILE 'BOOT6.DAT';
RETAIN GINISUM /* accumulates (2i-n-1)*X */
SUM /* accumulates X, so we can calculate mean */
IPT /* index of data value, 1=smallest, NPT=largest */
NPT; * /number of values in this sample */
BY IBOOT; /* need to do these calculations separately for each bootstrap sample */
IF FIRST.IBOOT THEN DO; /* if this is the first value in the sample initialize sums and counters */
GINISUM = 0;
IPT = 0;
SUM = 0;
NPT = SYMGET("NPT");
END;
IPT = IPT + 1; /* then add 1 to the number of data points */
SUM = SUM + X; /* add the value to the sum */
GINISUM = GINISUM + (2*IPT - (NPT + 1))*X; /* and add (2i-n-1)X to its sum */
IF LAST.IBOOT THEN DO; /* if just did the last value in the sample */
BOOTX = GINISUM/((NPT-1)*SUM); /* calculate Gini coefficient for sample */
OUTPUT; /* store on boots data set */
IF IBOOT = 0 THEN CALL /* if this was the Gini coeff for the observation data */
SYMPUT("SAMPLE",BOOTX); /* store it it a macro variable */
PUT BOOTX;
END;
KEEP BOOTX IBOOT; /* BOOTS data set only needs the estimate from the bootstrap sample (BOOTX) and the number of the bootstrap sample (IBOOT), needed to keep the original sample (IBOOT=0) separate from the bootstrap replicates (IBOOT > 0) */

/*** Calculate the Gini coefficient for each set of data in SKEW. Each unique set identifer by a unique value of ISKEW. N.B. This is needed only to compute the a coefficient for the accelerated bootstrap CI. The algorithm identical to that used for the bootstrap replicates above, except that the number of points in each skew data is n*t or n*t+1 ***/

PROC SORT DATA=SKEW;
BY ISKEW X;
DATA SKEWS; SET SKEW;
RETAIN GINISUM IPT SUM NPT;
BY ISKEW;
IF FIRST.ISKEW THEN DO;
GINISUM = 0;
IPT = 0;
SUM = 0;
NPT = SYMGET("NPT")*SYMGET("T"); /* number of points in unperturbed skew data */
IF ISKEW = 0 THEN NPT = NPT + 1; /* one extra point in each perturbed */
END;
IPT = IPT + 1;
SUM = SUM + x;
GINISUM = GINISUM + (2*IPT - (NPT + 1))*X;

IF LAST.ISKEW THEN DO;
BOOTX = GINISUM/((NPT-1)*SUM);
OUTPUT;
END;
KEEP BOOTX ISKEW;
/* calculate influence function for each data point; again, only needed to get the coefficient for the accelerated bootstrap */
DATA _NULL_; /* will not create an output data set */
RETAIN S0 /* S0 is the Gini coefficient in the original data */
U3 0 /* U3 accumulates third moments of I */
U2 0; /* U2 accumulates second moments of I */
SET SKEWS; /* work with the Gini coeff calculated for each perturbed data set */
NPT = SYMGET("NPT"); /* NPT = number of points in original data, used to figure out number of skew data sets* /
IF ISKEW = 0 THEN S0 = BOOTX; /* remember the original Gini value */
ELSE DO; /* if value from a perturbed data set */
INFL = (BOOTX - S0)/(SYMGET("T")+1); /* calculate influence */
U3 = U3 + INFL**3; /* accumulate third moments */
U2 = U2 + INFL**2; /* accumulate moments */
END;
IF ISKEW = NPT THEN DO; /* if this is the last perturbed data set */
A = U3/(6*U2**(3/2)); /* calculate skewness of influence */
CALL SYMPUT("A", A); /* and store it in a macro variable */
END;
/* calculate the percentiles for the bias-corrected and accelerated bootstraps */
DATA _NULL_; SET BOOTS;
RETAIN M0 /* M0: Gini coefficient in original data set */
P0; /* P0 will count number of bootstrap Gini coefficients less than M0.
This is used to get the Z0 coefficient for bias correction */
IF IBOOT = 0 THEN M0 = BOOTX; /* store the original Gini coefficient */
ELSE DO; /* if a bootstrap replicate */
IF BOOTX < M0 THEN /* see if less than original value */
P0 = P0 + 1; /* if so, add 1 to P0 */
END;
/* calculations for the percentiles */
IF IBOOT = SYMGET("NBOOT") THEN DO; /* after looking at all bootstrap replicates */
ALPHA = SYMGET("ALPHA"); /* remember what the alpha level was */
A = SYMGET("A"); /* calculated for A coefficient */
Z0 = PROBIT(P0/SYMGET("NBOOT")); /* calculate Z0 coefficient */
ZA2 = PROBIT(1-ALPHA/2); /* use alpha to calculate critical value */
PL = 100*ALPHA/2; /* percentiles for percentile confidence interval */
PU = 100*(1-ALPHA/2); /* these depend just on alpha */
PLBC = 100*PROBNORM(2*Z0-ZA2); /* percentiles for bias-corrected CI */
PUBC = 100*PROBNORM(2*Z0+ZA2); /* these depend on Z0 */
PLBCA = 100*PROBNORM(Z0+(Z0-ZA2)/(1-A*(Z0-ZA2))); /* percentiles for accelerated method */
PUBCA = 100*PROBNORM(Z0+(Z0+ZA2)/(1-A*(Z0+ZA2))); /* these depend on Z0 and A */
CALL SYMPUT("PL",PL); /* store percentiles in macro variables to be used by PROC UNIVARIATE */
CALL SYMPUT("PU",PU);
CALL SYMPUT("PLBC",PLBC);
CALL SYMPUT("PUBC",PUBC);
CALL SYMPUT("PLBCA",PLBCA);
CALL SYMPUT("PUBCA",PUBCA);
CALL SYMPUT("CI",100*(1-ALPHA)); /* also store 1-alpha value and the Z0 value to use in a title */
CALL SYMPUT("Z0",Z0);
END;
PROC UNIVARIATE DATA=BOOTS; /* find the appropriate percentiles of the bootstrap distribution */
WHERE IBOOT > 0; /* do not include sample value */
VAR BOOTX;
OUTPUT OUT=PERC /* store percentiles in a data set */
PCTLPTS = &PL &PU &PLBC &PUBC &PLBCA &PUBCA /* specifies desired percentiles */
PCTLNAME=CI_L CI_U BC_L BC_U A_L A_U /* and their variable names */
PCTLPRE = BOOT; /* needed, but ignored by SAS ver 6.07 */

PROC PRINT NOOBS; /* print the values for each confidence interval */
TITLE1 "&CI PERCENT BOOTSTRAP CONFIDENCE INTERVALS";
TITLE2 "SAMPLE VALUE: &SAMPLE";
TITLE3 "&NBOOT BOOTSTRAP SAMPLES, CONSTANTS: Z0 = &Z0, A = &A";
TITLE4 "PERCENTILES FOR THE BC INTERVAL ARE: &PLBC, &PUBC";
TITLE5 "PERCENTILES FOR THE ACCELERATED INTERVAL ARE: &PLBCA, &PUBCA";