/*
	How does one determine when to use SET as opposed to using 
	MERGE?  If we think of SAS data sets as blocks, then using 
	SET means we are stacking the blocks one on top of the other.  
	Using MERGE means we are aligning the blocks side by side.

	In order to use a SET statement correctly, we need both data 
	sets to have most, if not all, variables in common.  In other 
	words, we are concatenating observations into a single data 
	set.  To use MERGE correctly, we generally have different 
	variables in the data sets, with possibly one or more variables 
	in common to use as "key" variables.  With MERGE, we are 
	concatenating variables into a single data set.

	What happens when we use SET when we should have used MERGE?
*/

/* Session setup: listing line width plus URL filerefs for the two raw data files. */
options linesize=100;

filename foo1 url "http://www.uvm.edu/~abh/stat295/datasets/fitness2.dat";
filename foo2 url "http://www.uvm.edu/~abh/stat295/datasets/bmi2.dat";

/* Read the raw fitness file (fileref foo1) into data set FITNESS. */
data fitness;
    infile foo1;
    input id
          Sex $1.
          Age
          Weight
          Oxygen
          RunTime
          RestPulse
          RunPulse
          MaxPulse;
run;

/* List the data set that was just created. */
title "Fitness Data";
proc print data=fitness;
run;

/* Read the raw BMI file (fileref foo2) into data set BMI. */
data bmi;
    infile foo2;
    input id
          Sex $1.
          bmi;
run;

/* List the data set that was just created. */
title "BMI Data";
proc print data=bmi;
run;

/* Deliberately stack FITNESS and BMI with SET (the wrong tool here) to show what happens. */
data fit_bmi;
    set fitness
        bmi;
run;

title "Fitness and BMI data: using SET instead of MERGE";
proc print data=fit_bmi;
run;

/*
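	We end up with a data set that has lots of missing data: the 
	BMI observations are stacked below the fitness observations, so 
	bmi is missing on every fitness row and the fitness measurements 
	are missing on every BMI row.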
	
	What happens when we use MERGE when we should have used SET?
*/

/* Route each FITNESS observation to an output data set according to its sex code. */
data males females miscoded;
    set fitness;
    select (sex);
        when ("M") output males;
        when ("F") output females;
        /* Anything other than "M" or "F" is treated as a coding error. */
        otherwise  output miscoded;
    end;
run;

/* List the male subset. */
title "Fitness Data - Males only";
proc print data=males;
run;

/* List the female subset. */
title "Fitness Data - Females only";
proc print data=females;
run;

/*
	Deliberately recombine the two subsets with a MERGE that has no 
	BY statement (the wrong tool here) to show what happens.  Note 
	that this replaces the FITNESS data set.
*/
data fitness;
    merge males
          females;
run;

title "Fitness data: using MERGE instead of SET";
proc print data=fitness;
run;

/*
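	We end up with only the females in the merged data set.  Without 
	a BY statement, MERGE pairs observations by position, and for 
	variables with the same name the values from the last data set 
	listed (FEMALES) overwrite those from MALES.  Males would survive 
	only in rows beyond the last female observation.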

	What if we had included a BY statement?
*/

/*
	Match-merge the two subsets on ID.  MERGE with a BY statement 
	requires every input data set to be ordered by the BY variable, 
	so sort MALES and FEMALES first rather than relying on the order 
	in which the raw file happened to list the IDs.
*/
proc sort data=males;
     by id;
run;

proc sort data=females;
     by id;
run;

data fitness;
     merge males females;
     by id;
run;

/* The title names the BY variant so this listing can be told apart from the no-BY merge listing. */
proc print data=fitness;
title "Fitness data: using MERGE with BY instead of SET";
run;

/*
	We end up with the correct data set when we used a BY 
	statement.  This only works correctly because there are 
	no IDs in common between the two data sets, so the merged 
	data set includes all IDs, and because both data sets are 
	in ID order, as a BY statement requires.  In other words, 
	we were lucky that this worked.  
*/