############################################################################ # HYPERGEOMETRIC DISTRIBUTION IN R ############################################################################ # X- number of successes m in n draws without replacement from a finite # population of size N which contains exactly M successes ############################################################################ #example: a box contains 20 red , 12 blue balls. # let red balls be a 'success', blue a 'failure' # we are drawing 10 balls without replacement and # count the number of red balls (successes). r=20 b=12 n=10 ############################################################################ # generate a hypergeometric random variable (r,b,n) # and save it in a variable X ############################################################################ # one realization from negative binomial distribution rhyper(1,r,b,n) (X=rhyper(10000,r,b,n)) #print the variable X values ############################################################################ ############################################################################ #Make a histogram using the generated variable X ############################################################################ #Once we have generated the hypergeomteric random variable (r,b,n) #we can make plots, calculate quantiles #check the documentation about hist function ?hist #use nclass to define the number of bins of the histogram hist(X,nclass=25,col='purple',main='Histogram of X') hist(X,nclass=100,col='purple',main='Histogram of X') ############################################################################ # Make plots against a predifined grid # using dhyper ############################################################################ #define the grid grid = seq(0, 10, by = 1) #evaluate log binomial against the defined grid and make a plot evalXlog=dhyper(grid, r, b,n, log = TRUE) plot(grid, evalXlog, type = "l", ylab = "log density", main = "log density from dhyper", col='red') #evaluate binomial against the defined grid and make a plot evalX=dhyper(grid, r, b,n, log = FALSE) plot(grid, evalX,col='blue',type = "l", ylab = " density", main = "density from dhyper") ############################################################################ ############################################################################ # dhyper is evaluating the density, so we can use it to calculate # probabilities that X is exactly some value, or it is smaller or bigger # Example: Calculate the probability that there were 5 red balls in 10 draws # that is we need to calculate # P(X=5)= (factorial(r)/(factorial(x)*factorial(r-x) ) * factorial(b)/( factorial(n-x) *factorial(b-n+x)))/ (factorial(r+b)/ (factorial(n)*factorial(r+b-n) )) ############################################################################ #Calculate P(X=5) using the hard coded PMF of binomial x=5 (probXis2fla= (factorial(r)/(factorial(x)*factorial(r-x) ) * factorial(b)/( factorial(n-x) *factorial(b-n+x)))/ (factorial(r+b)/ (factorial(n)*factorial(r+b-n) ))) #Calculate P(X=2) using the R function dbinom (probXis2= dhyper(x, r, b,n, log = F)) ############################################################################ ############################################################################ ##calculate P( 6 <=X<= 8) #Several ways to do it! I will show you 3 today! ############################################################################ #1. #use the hard coded PMF of negative binomial (probXbtw6and8fla= (factorial(r)/(factorial(6)*factorial(r-6) ) * factorial(b)/( factorial(n-6) *factorial(b-n+6)))/ (factorial(r+b)/ (factorial(n)*factorial(r+b-n) ))+ (factorial(r)/(factorial(7)*factorial(r-7) ) * factorial(b)/( factorial(n-7) *factorial(b-n+7)))/ (factorial(r+b)/ (factorial(n)*factorial(r+b-n) ))+ (factorial(r)/(factorial(8)*factorial(r-8) ) * factorial(b)/( factorial(n-8) *factorial(b-n+8)))/ (factorial(r+b)/ (factorial(n)*factorial(r+b-n) ))) #2. #We can write a parametrized function that can be recalled each time we need it. #This techique will be helful when you work when the distribution function #such as dbinom not available in R #This is just a more useful technique than the first one. #It makes it easier to calculate much more probabilities like P( 6 <=X<= 17 ) myDistHyper=function(numb,r,b,n){ (factorial(r)/(factorial(numb)*factorial(r-numb) ) * factorial(b)/( factorial(n-numb) *factorial(b-n+numb)))/ (factorial(r+b)/ (factorial(n)*factorial(r+b-n) )) } (probXbtw6and8MyFla=myDistHyper(6,r,b,n)+myDistHyper(7,r,b,n)+myDistHyper(8,r,b,n)) #or we can write a for loop to make a summation probXbtw6and8MyFla2=0 for (i in (6:8)){ probXbtw6and8MyFla2=probXbtw6and8MyFla2+myDistHyper(i,r,b,n) } # Note: # In general R is very slow when it comes to using for loop # There are more sofisticated functions such as apply, sapply, lapply.. # that can be used to avoid the for loop. We will get there soon. #3. #use dhyper #provided a vector 6:8 #dhyper will calculate probabilities for each values in the vector 6:8 dhyper(6:8, r, b,n, log = F) #so to calculate prob P( 6 <=X<= 8) we will need to sum up probabilities #returned by dhyper evaluated at each of the values 6:8 (probXbtw6and8=sum(dhyper(6:8, r, b,n, log = F))) ############################################################################ #calculate 75 percentile ############################################################################ qhyper(0.75,r,b,n) ############################################################################ #END ############################################################################