# Network sampling ----------------------------------------------------------
#
# Data set: http://socialcomputing.asu.edu/datasets/Flickr
# Four files are included:
#
#   1. nodes.csv       -- all the users. Works as a dictionary of every user
#                         in the data set (useful for fast reference); it
#                         contains all node ids used in the data set.
#   2. groups.csv      -- all the groups; contains every group id used in
#                         the data set.
#   3. edges.csv       -- the friendship network among the users. A user's
#                         friends are represented as edges. The network is
#                         symmetric, so each edge appears only once, e.g.
#                         the line "1,2" means user 1 is a friend of user 2.
#   4. group-edges.csv -- the user-group membership. On each line the first
#                         entry is the user and the second is the group index.
#
# For more details see the relevant papers and code:
#   http://www.public.asu.edu/~ltang9/social_dimension.html
#
# Some other resources for network data:
#   https://snap.stanford.edu/data/egonets-Facebook.html
#   https://snap.stanford.edu/data/#communities
#   http://www.kdnuggets.com/2014/08/interesting-social-media-datasets.html
#   https://snap.stanford.edu/data/

# The raw data contain users (nodes), the groups where users are members,
# the friendship relations between users (edges), and the group membership
# of each user (group_edges).
nodes       <- read.csv('D:/TA/STAT410/2016/Flickr-dataset/Flickr-dataset/data/nodes.csv')
groups      <- read.csv('D:/TA/STAT410/2016/Flickr-dataset/Flickr-dataset/data/groups.csv')
edges       <- read.csv('D:/TA/STAT410/2016/Flickr-dataset/Flickr-dataset/data/edges.csv')
group_edges <- read.csv('D:/TA/STAT410/2016/Flickr-dataset/Flickr-dataset/data/group-edges.csv')

# Extract a population of N = 100 users (processing the full data set
# requires far more computing resources); the next section of the script
# restricts all four tables to these N users.
N <- 100
# Variable of interest: binary indicator of whether each user has a
# Dropbox account. We estimate the population total of such users.
yi <- get(load('yi.RData'))
tau <- sum(yi)

# Observational units (users): keep only the first N users and the
# group memberships in which both entries involve retained ids.
ds_nodes <- nodes[1:N, 1]
ind <- which(group_edges[, 1] %in% ds_nodes & group_edges[, 2] %in% ds_nodes)
ds_group_edges <- group_edges[ind, ]

# Selection units (groups): the groups that appear among the retained
# memberships. mu is the mean of the total over the M groups.
ind1 <- which(groups[, 1] %in% ds_group_edges[, 2])
ds_groups <- groups[ind1, 1]
M <- length(ds_groups)
mu <- tau / M

# Friendship edges restricted to the retained users.
ind2 <- which(edges[, 1] %in% ds_nodes & edges[, 2] %in% ds_nodes)
ds_edges <- edges[ind2, ]

##############################################################################
# At this point the data on users 1:N are extracted.
##############################################################################

# Fill the adjacency matrix. Node ids are used to index A directly, so A is
# (N+1) x (N+1) -- presumably the ids of the first N users run from 1 to at
# most N+1; TODO(review): confirm against nodes.csv.
A <- matrix(0, N + 1, N + 1)
# BUG FIX: the original loop ran over 1:N, but the number of retained edges
# is nrow(ds_edges), not N. With fewer than N edges the old loop fails on NA
# indices; with more, edges were silently dropped. Loop over the actual rows.
for (x in seq_len(nrow(ds_edges))) {
  A[ds_edges[x, 1], ds_edges[x, 2]] <- 1
}
# Make A symmetric (each friendship appears only once in edges.csv).
A <- A + t(A)

#############################################################
# tau_multiplicity draws snowball samples and estimates tau
# using the multiplicity estimator (equations 15.3-15.4,
# page 203 of the course text).
#
# Arguments:
#   n              -- number of selection units (groups) to draw
#   M              -- total number of selection units
#   A              -- (symmetric) adjacency matrix of the friendship network
#   yi             -- binary variable of interest per user
#   ds_group_edges -- user-group membership, restricted to the N users
#   ds_edges       -- friendship edges, restricted to the N users (unused
#                     directly; the information is already in A)
#   ds_nodes       -- ids of the N users
#
# NOTE(review): ds_groups is read from the global environment, not passed
# as a parameter -- it must exist wherever this function is called.
#
# Returns: the multiplicity estimate of tau.
#############################################################
tau_multiplicity <- function(n, M, A, yi, ds_group_edges, ds_edges, ds_nodes) {
  # Draw n selection units (groups); users in each selected group
  # report on their friends in the other groups.
  selUnits <- sample(1:M, n)
  w <- rep(NA, length(selUnits))
  for (s in 1:length(selUnits)) {
    # Users belonging to the s-th selected group.
    indG <- which(ds_group_edges[, 2] == ds_groups[selUnits[s]])
    st <- ds_group_edges[indG, 1]
    # Retrieve the friends of the selected users: this creates a
    # sub-network around each of the selected users.
    ws_subNtwrk <- rep(NA, length(st))
    for (i in (1:length(st))) {
      # Friendship is a symmetric relation, so we check both the rows
      # and the columns of the adjacency matrix.
      NextWave <- list()
      NextWave[[1]] <- unique(c(which(A[st[i], ] == 1), which(A[, st[i]] == 1)))
      # Follow the waves of the snowball until no new nodes appear
      # (100 is an upper bound on the number of waves).
      if (!(length(NextWave[[1]]) == 0)) {
        for (l in 2:100) {
          indstNew <- list()
          for (k in NextWave[[l - 1]]) {
            # NOTE(review): k is used both as a list index and as a position
            # in ds_nodes; this assumes the node ids of the first N users
            # coincide with their positions -- TODO confirm.
            indstNew[[k]] <- c(which(A[ds_nodes[k], ] == 1),
                               which(A[, ds_nodes[k]] == 1))
          }
          NextWave[[l]] <- setdiff(unique(unlist(indstNew)), NextWave[[l - 1]])
          if (length(NextWave[[l]]) == 0) break
        }
      }
      sub_ntwrk_i <- c(st[i], ds_nodes[unique(unlist(NextWave))])
      # Size of the sub-network reached from st[i].
      mi <- length(sub_ntwrk_i)
      # Which yi correspond to the selected sub-network.
      indy <- which(ds_nodes %in% sub_ntwrk_i)
      # w_i for the selected sub-network (equation 15.3, page 203).
      ws_subNtwrk[i] <- sum(yi[indy]) / mi
    }
    w[s] <- sum(ws_subNtwrk)  # equation 15.3, page 203
  }
  # Multiplicity estimator of tau (equation 15.4, page 203).
  tau_multiplicity_hat <- (M / n) * sum(w)
  return(tau_multiplicity_hat)
}

# Serial Monte Carlo replication of the estimator -- this is very slow.
b <- 10
tau_m <- replicate(b, tau_multiplicity(n = 20, M = M, A = A, yi = yi,
                                       ds_group_edges = ds_group_edges,
                                       ds_edges = ds_edges,
                                       ds_nodes = ds_nodes))
mean(tau_m)
hist(tau_m, nclass = 10)
abline(v = tau, col = 'red')

# Try parallel programming using the 'parallel' package.
library(parallel)
# Detect the number of cores on this machine.
nCores <- detectCores(all.tests = FALSE, logical = TRUE)
# Create a cluster with nCores workers; parSapply is a parallel
# version of the sapply function.
cl <- makeCluster(nCores)
b <- 5000
# Start the timer.
t1 <- proc.time()[1]
# clusterExport ships the environment -- the function tau_multiplicity, the
# data objects yi, ds_group_edges, ds_edges, ds_nodes, the constants n, M, N
# and the matrix A -- to every parallel worker.
clusterExport(cl, varlist = ls(), envir = environment())
# parSapply is the analogue of sapply: both run a function over a vector
# (here 1:b) and combine the results into a vector, so tau_m needs no
# additional transformation.
tau_m <- parSapply(cl, 1:b, function(x) {
  tau_multiplicity(n = 20, M, A, yi, ds_group_edges, ds_edges, ds_nodes)
})
stopCluster(cl)
# Elapsed time in minutes.
(t <- proc.time()[1] - t1) / 60
mean(tau_m)
var(tau_m)
hist(tau_m)
abline(v = tau, col = 'red')

# Use SQL within R.
# sqldf requires data frames as input and returns data frames; it also
# requires well-defined column names, so rename the columns first.
library(sqldf)

colnames(nodes)       <- 'Students'
colnames(groups)      <- 'Gr'
colnames(edges)       <- c('Friend1', 'Friend2')
colnames(group_edges) <- c('Friend', 'Gr')

# Members of group 190, queried directly from the membership table.
df1 <- sqldf('select group_edges.* from group_edges where group_edges.Gr=190')
# The same membership obtained through a join with the groups table.
df <- sqldf('select group_edges.* from groups join group_edges on groups.Gr=group_edges.Gr where groups.Gr=190')
# df and df1 are data frames.
colnames(df)
# Rows of df belonging to user 78352.
df_friend <- sqldf('select * from df where df.Friend= 78352')