# Network sampling ----------------------------------------------------------
#
# Data set: http://socialcomputing.asu.edu/datasets/Flickr
# Four files are included:
#
#   1. nodes.csv       -- all the users. Works as a dictionary of every user
#                         in the data set (useful for fast reference); it
#                         contains all node ids used in the data set.
#   2. groups.csv      -- all the groups; contains every group id used in
#                         the data set.
#   3. edges.csv       -- the friendship network among the users. A user's
#                         friends are represented as edges. The network is
#                         symmetric, so each edge appears only once, e.g.
#                         the line "1,2" means user 1 is a friend of user 2.
#   4. group-edges.csv -- the user-group membership. On each line the first
#                         entry is the user and the second is the group index.
#
# For more details see the relevant papers and code:
#   http://www.public.asu.edu/~ltang9/social_dimension.html
#
# Some other resources for network data:
#   https://snap.stanford.edu/data/egonets-Facebook.html
#   https://snap.stanford.edu/data/#communities
#   http://www.kdnuggets.com/2014/08/interesting-social-media-datasets.html
#   https://snap.stanford.edu/data/

# The raw data contain users (nodes), the groups where users are members,
# the friendship relations between users (edges), and the group membership
# of each user (group_edges).
nodes       <- read.csv('D:/TA/STAT410/2016/Flickr-dataset/Flickr-dataset/data/nodes.csv')
groups      <- read.csv('D:/TA/STAT410/2016/Flickr-dataset/Flickr-dataset/data/groups.csv')
edges       <- read.csv('D:/TA/STAT410/2016/Flickr-dataset/Flickr-dataset/data/edges.csv')
group_edges <- read.csv('D:/TA/STAT410/2016/Flickr-dataset/Flickr-dataset/data/group-edges.csv')

# Extract a population of N = 100 users (processing the full data set
# requires far more computing resources); the next section of the script
# restricts all four tables to these N users.
N <- 100
# Variable of interest: binary indicator of whether each user has a
# Dropbox account. We estimate the population total of such users.
yi <- get(load('yi.RData'))
tau <- sum(yi)

# Observational units (users): keep only the first N users and the
# group memberships in which both entries involve retained ids.
ds_nodes <- nodes[1:N, 1]
ind <- which(group_edges[, 1] %in% ds_nodes & group_edges[, 2] %in% ds_nodes)
ds_group_edges <- group_edges[ind, ]

# Selection units (groups): the groups that appear among the retained
# memberships. mu is the mean of the total over the M groups.
ind1 <- which(groups[, 1] %in% ds_group_edges[, 2])
ds_groups <- groups[ind1, 1]
M <- length(ds_groups)
mu <- tau / M

# Friendship edges restricted to the retained users.
ind2 <- which(edges[, 1] %in% ds_nodes & edges[, 2] %in% ds_nodes)
ds_edges <- edges[ind2, ]

##############################################################################
# At this point the data on users 1:N are extracted.
##############################################################################

# Fill the adjacency matrix. Node ids are used to index A directly, so A is
# (N+1) x (N+1) -- presumably the ids of the first N users run from 1 to at
# most N+1; TODO(review): confirm against nodes.csv.
A <- matrix(0, N + 1, N + 1)
# BUG FIX: the original loop ran over 1:N, but the number of retained edges
# is nrow(ds_edges), not N. With fewer than N edges the old loop fails on NA
# indices; with more, edges were silently dropped. Loop over the actual rows.
for (x in seq_len(nrow(ds_edges))) {
  A[ds_edges[x, 1], ds_edges[x, 2]] <- 1
}
# Make A symmetric (each friendship appears only once in edges.csv).
A <- A + t(A)

#############################################################
# tau_multiplicity draws snowball samples and estimates tau
# using the multiplicity estimator (equations 15.3-15.4,
# page 203 of the course text).
#
# Arguments:
#   n              -- number of selection units (groups) to draw
#   M              -- total number of selection units
#   A              -- (symmetric) adjacency matrix of the friendship network
#   yi             -- binary variable of interest per user
#   ds_group_edges -- user-group membership, restricted to the N users
#   ds_edges       -- friendship edges, restricted to the N users (unused
#                     directly; the information is already in A)
#   ds_nodes       -- ids of the N users
#
# NOTE(review): ds_groups is read from the global environment, not passed
# as a parameter -- it must exist wherever this function is called.
#
# Returns: the multiplicity estimate of tau.
#############################################################
tau_multiplicity <- function(n, M, A, yi, ds_group_edges, ds_edges, ds_nodes) {
  # Draw n selection units (groups); users in each selected group
  # report on their friends in the other groups.
  selUnits <- sample(1:M, n)
  w <- rep(NA, length(selUnits))
  for (s in 1:length(selUnits)) {
    # Users belonging to the s-th selected group.
    indG <- which(ds_group_edges[, 2] == ds_groups[selUnits[s]])
    st <- ds_group_edges[indG, 1]
    # Retrieve the friends of the selected users: this creates a
    # sub-network around each of the selected users.
    ws_subNtwrk <- rep(NA, length(st))
    for (i in (1:length(st))) {
      # Friendship is a symmetric relation, so we check both the rows
      # and the columns of the adjacency matrix.
      NextWave <- list()
      NextWave[[1]] <- unique(c(which(A[st[i], ] == 1), which(A[, st[i]] == 1)))
      # Follow the waves of the snowball until no new nodes appear
      # (100 is an upper bound on the number of waves).
      if (!(length(NextWave[[1]]) == 0)) {
        for (l in 2:100) {
          indstNew <- list()
          for (k in NextWave[[l - 1]]) {
            # NOTE(review): k is used both as a list index and as a position
            # in ds_nodes; this assumes the node ids of the first N users
            # coincide with their positions -- TODO confirm.
            indstNew[[k]] <- c(which(A[ds_nodes[k], ] == 1),
                               which(A[, ds_nodes[k]] == 1))
          }
          NextWave[[l]] <- setdiff(unique(unlist(indstNew)), NextWave[[l - 1]])
          if (length(NextWave[[l]]) == 0) break
        }
      }
      sub_ntwrk_i <- c(st[i], ds_nodes[unique(unlist(NextWave))])
      # Size of the sub-network reached from st[i].
      mi <- length(sub_ntwrk_i)
      # Which yi correspond to the selected sub-network.
      indy <- which(ds_nodes %in% sub_ntwrk_i)
      # w_i for the selected sub-network (equation 15.3, page 203).
      ws_subNtwrk[i] <- sum(yi[indy]) / mi
    }
    w[s] <- sum(ws_subNtwrk)  # equation 15.3, page 203
  }
  # Multiplicity estimator of tau (equation 15.4, page 203).
  tau_multiplicity_hat <- (M / n) * sum(w)
  return(tau_multiplicity_hat)
}

# Serial Monte Carlo replication of the estimator -- this is very slow.
b <- 10
tau_m <- replicate(b, tau_multiplicity(n = 20, M = M, A = A, yi = yi,
                                       ds_group_edges = ds_group_edges,
                                       ds_edges = ds_edges,
                                       ds_nodes = ds_nodes))
mean(tau_m)
hist(tau_m, nclass = 10)
abline(v = tau, col = 'red')

# Try parallel programming using the 'parallel' package.
library(parallel)
# Detect the number of cores on this machine.
nCores <- detectCores(all.tests = FALSE, logical = TRUE)
# Create a cluster with nCores workers; parSapply is a parallel
# version of the sapply function.
cl <- makeCluster(nCores)
b <- 5000
# Start the timer.
t1 <- proc.time()[1]
# clusterExport ships the environment -- the function tau_multiplicity, the
# data objects yi, ds_group_edges, ds_edges, ds_nodes, the constants n, M, N
# and the matrix A -- to every parallel worker.
clusterExport(cl, varlist = ls(), envir = environment())
# parSapply is the analogue of sapply: both run a function over a vector
# (here 1:b) and combine the results into a vector, so tau_m needs no
# additional transformation.
tau_m <- parSapply(cl, 1:b, function(x) {
  tau_multiplicity(n = 20, M, A, yi, ds_group_edges, ds_edges, ds_nodes)
})
stopCluster(cl)
# Elapsed time in minutes.
(t <- proc.time()[1] - t1) / 60
mean(tau_m)
var(tau_m)
hist(tau_m)
abline(v = tau, col = 'red')

# Use SQL within R.
# sqldf requires data frames as input and returns data frames; it also
# requires well-defined column names, so rename the columns first.
library(sqldf)

colnames(nodes)       <- 'Students'
colnames(groups)      <- 'Gr'
colnames(edges)       <- c('Friend1', 'Friend2')
colnames(group_edges) <- c('Friend', 'Gr')

# Members of group 190, queried directly from the membership table.
df1 <- sqldf('select group_edges.* from group_edges where group_edges.Gr=190')
# The same membership obtained through a join with the groups table.
df <- sqldf('select group_edges.* from groups join group_edges on groups.Gr=group_edges.Gr where groups.Gr=190')
# df and df1 are data frames.
colnames(df)
# Rows of df belonging to user 78352.
df_friend <- sqldf('select * from df where df.Friend= 78352')