### Code for Week 02 lab for Stat 430 1203, Sept. 17, 2012. ### By Jack Davis jackd@sfu.ca www.sfu.ca/~jackd ### The most important thing about programs are the comments # anything showing after a # is a comment # it lets you write explanations of what you're doing before or even in the middle of you code ### I like to put three #s before most comments so they stand out more. ########## # sample() function ### Pick a number from 1 to 10 sample(1:10, size=1) ### Pick four numbers from 1 to 10 sample(1:10, size=4) ### Pick four numbers from 1 to 10, you can use the same number twice sample(1:10, size=4, replace=TRUE) ### You can have more than one of the same number ### In this case you'll end up with multiples in your sample some times even though ### replacement is FALSE. temp = c(1,1,1,1,1,2,2,2,2,2) sample(temp, size=4) ### Letters work too ABC = c("A","B","C","Horse") sample(ABC, size=7, replace=TRUE) ### You can't make the size bigger than your list if you're not using replace sample(1:10, size=40) ########## # print() function ### The print function makes R print to the screen whatever is inside the ( ) print("Hello World!") ### We can also assign variables to print course = "Stat 430" print(course) today <- "Sept 16 2012" print(today) ### Note that = and <- do exactly the same thing ### We could also make a list of numbers or words (strings) and print it ### c( , , , ) is used to make a list from scratch list1 = 1:5 list2 <- c("A","B","C","D") list3 = c(4,-1,10^12,6*8) print(list1) print(list2) print(list3) ########## ### For loops for(k in 1:5) { ## things in here will happen 5 times } ## Everything inside the { } will happen for each number in the list after the word "in". ## The list 1:5 has 5 numbers, so everything happens five times. for(k in 1:5) { print("That's one iteration") } ## The counting variable can itself be used in an iteration if we want. for(count in 1:5) { print(count * 20) } ## Finally, the list doesn't have to be counting up from one, either, it can be anything list of numbers number_list = c(6,4,1,8) for(count in number_list) { print(count * 10) } ########## # Factorial() and choose() ### Say we have a sample of size 8 and another of size 12. ### How many combinations do we need to look at? ### 20! / 12!8! factorial(20)/ (factorial(12) * factorial(8) ) ## or... choose(20,8) ## or... choose(20,12) ### If we had samples of size 14 and 17 choose(31,14) ### 265 million combinations ############### ### Randomization Test - Single Run ### Define the two samples group1 = c(17, 15, 20, 27, 22, 20, 20, 18) group2 = c(18, 15, 9, 18, 13, 17 ,7, 11, 12, 15, 26, 24) pool = c(group1, group2) ## This makes a list of both groups together, a pool of 20 values. ### Next, we randomly select one of the 125,970 combinations to go in ### decide which 8 of the 20 are going to be group1 samp = sample(1:20,size=8) ## Random sample of 8 numbers from 1 to 20 rndm_grp1 = pool[samp] ## 8 random numbers from the pool of 20 numbers above rndm_grp2 = pool[-samp] ## Every element of the pool that was NOT in the sample of 8 numbers ## i.e. The other 12 numbers. Notice the - sign before samp. ## Let's say our alternative hypothesis was that group1's mean is higher than group2's mean ## If this is true, then it should be larger than a random difference between groups most of the time. group_diff = mean(group1) - mean(group2) random_diff = mean(rndm_grp1) - mean(rndm_grp2) ### Is it? group_diff > random_diff ################ ### Randomization Test - Monte Carlo ### Set up the group difference and a counter before we start random_was_bigger = 0 group_diff = mean(group1) - mean(group2) ### For each of 10000 iterations... for(k in 1:10000) { ### Make the combination and assign the values to two groups samp = sample(1:20,size=8) rndm_g1 = pool[samp] rndm_grp2 = pool[-samp] ### Find the differences in the means of the random groups random_diff = mean(rndm_grp1) - mean(rndm_grp2) ### If the mean difference in the random groups was the greater difference... if(random_diff > group_diff) { ### Count 1 more time that the random group was bigger random_was_bigger = random_was_bigger + 1 } } ### The p-value is the proportion of times the random group had a bigger mean difference random_was_bigger / 10000 # divided by 10000 because there were 10000 iterations.