Delete observations from a SAS data set when all or most of variables has missing data
/* Sample data set */
data missing;
input n1 n2 n3 n4 n5 n6 n7 n8 c1 $ c2 $ c3 $ c4 $;
datalines;
1 . 1 . 1 . 1 4 a . c .
1 1 . . 2 . . 5 e . g h
1 . 1 . 3 . . 6 . . k i
1 . . . . . . . . . . .
1 . . . . . . . c . . .
. . . . . . . . . . . .
;
run;
*If you want to delete observation if the data for every variable is missing then use the following code;
*Approach 1: Using the coalesce option inside the datastep;
data drop_misobs;
set missing;
if missing(coalesce(of _numeric_)) and missing(coalesce(of _character_)) then delete;
run;
Pros:
*Simple code
Cons;
*This code doesn't work if we want to delete observation based on specific variables and not all of them.
*Approach 2:Using N/NMISS option inside the datastep;
data drop_missing;
set missing;
*Checks the Non missing values using ;
if n(n1, n2, n3, n4, n5, n6, n7, n8, c1, c2, c3, c4)=0 then delete;
run;
data drop_missing;
set missing;
*Checks the missing values using nmiss option;
if nmiss(n1, n2, n3, n4, n5, n6, n7, n8, c1, c2, c3, c4)=12 then delete; *12 is the total number of variables in the dataset missing.;
run;
*If you want to delete records based on few variables and don't want to type all the variable names in the IF-THEN clause use the following code;
*Task: Delete observations from the dataset if all variables in the dataset except (N1 and C1) has missing data;
proc contents data=missing out=contents(keep=memname name);
run;
*Create a macro variable names with list of variable names in the dataset;
proc sql;
select distinct name into:names separated by ','
from contents(where=(upcase(name) ^in ('N1','C1'))) where memname='MISSING'; *Excluding 2 variables in the dataset;
quit;
data remove_missing;
set missing;
if n(&names) lt 1 then delete;
run;
('’)
data missing;
input n1 n2 n3 n4 n5 n6 n7 n8 c1 $ c2 $ c3 $ c4 $;
datalines;
1 . 1 . 1 . 1 4 a . c .
1 1 . . 2 . . 5 e . g h
1 . 1 . 3 . . 6 . . k i
1 . . . . . . . . . . .
1 . . . . . . . c . . .
. . . . . . . . . . . .
;
run;
*If you want to delete observation if the data for every variable is missing then use the following code;
*Approach 1: Using the coalesce option inside the datastep;
data drop_misobs;
set missing;
if missing(coalesce(of _numeric_)) and missing(coalesce(of _character_)) then delete;
run;
Pros:
*Simple code
Cons;
*This code doesn't work if we want to delete observation based on specific variables and not all of them.
*Approach 2:Using N/NMISS option inside the datastep;
data drop_missing;
set missing;
*Checks the Non missing values using ;
if n(n1, n2, n3, n4, n5, n6, n7, n8, c1, c2, c3, c4)=0 then delete;
run;
data drop_missing;
set missing;
*Checks the missing values using nmiss option;
if nmiss(n1, n2, n3, n4, n5, n6, n7, n8, c1, c2, c3, c4)=12 then delete; *12 is the total number of variables in the dataset missing.;
run;
*If you want to delete records based on few variables and don't want to type all the variable names in the IF-THEN clause use the following code;
*Task: Delete observations from the dataset if all variables in the dataset except (N1 and C1) has missing data;
proc contents data=missing out=contents(keep=memname name);
run;
*Create a macro variable names with list of variable names in the dataset;
proc sql;
select distinct name into:names separated by ','
from contents(where=(upcase(name) ^in ('N1','C1'))) where memname='MISSING'; *Excluding 2 variables in the dataset;
quit;
data remove_missing;
set missing;
if n(&names) lt 1 then delete;
run;
('’)