--- title: "Partially Linear Models with Endogeneity: a conditional moment based approach" author: "Bertille Antoine and Xiaolin Sun" date: "2024-03-08" output: pdf_document: default html_document: default --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r} ##install.packages(c("dplyr","stats","MASS","boot","ggplot2","gridExtra","readxl")) library(dplyr) library(stats) library(MASS) library(boot) library(ggplot2) library(gridExtra) library(readxl) ``` ## 1. define the functions for estimating beta and standard error of beta. ### 1.1 2SLS with Z part linear ```{r} beta.2sls.het= function (nobs, X, Z, Y, W, withZ){ #nobs: number of observations #X: the endogenous variables #Z: the exogenous variables #Y: outcome #W: the instrumens #withZ: if withZ = 1, the controls Z is included in the regression c= matrix(1,nobs,1) if (withZ==1) { Xmat = as.matrix(data.frame(c,X,Z)) Zmat = as.matrix(data.frame(c,W,Z)) } else { Xmat = as.matrix(data.frame(c,X)) Zmat = as.matrix(data.frame(c,W)) } proj = Zmat %*% solve(t(Zmat)%*% Zmat) %*% t(Zmat) beta = solve(t(Xmat)%*% proj %*% Xmat) %*% (t(Xmat)%*% proj %*% Y) residual = Y - Xmat %*% beta ncolX = dim(Zmat)[2] sigma = mean(residual^2) mat1 = residual^2 %*% matrix(1,1,ncol = ncolX) A = t(Zmat * mat1) %*% Zmat varcov = solve(t(Xmat) %*% proj %*% Xmat) %*% t(Xmat) %*% Zmat %*% solve(t(Zmat) %*% Zmat) %*% A %*% solve(t(Zmat) %*% Zmat) %*% t(Zmat) %*% Xmat %*% solve(t(Xmat) %*% proj %*% Xmat) var=diag(varcov) #extract the diagonal se = sqrt(var) seMat = matrix(se, dim(Xmat)[2],1) tstat = beta/seMat output =data.frame(beta, seMat, tstat) #produce the beta (estimate), the seMat (standard error under heteroscedasticity), and the tstat (t-statistics) return (output) } ``` ### 2. gmm with Z part linear, instruments W and W^2 ```{r} beta.gmm.het= function (nobs, X, W, Z, Y){ c= matrix(1,nobs,1) Xmat = as.matrix(data.frame(c,X,Z)) Zmat = as.matrix(data.frame(c,W,scale(W^2),Z)) proj = Zmat %*% solve(t(Zmat)%*% Zmat) %*% t(Zmat) beta = solve(t(Xmat)%*% proj %*% Xmat) %*% (t(Xmat)%*% proj %*% Y) residual = Y - Xmat %*% beta ncolX = dim(Zmat)[2] mat1 = residual^2 %*% matrix(1,1,ncol = ncolX) A = t(Zmat * mat1) %*% Zmat varcov = solve(t(Xmat) %*% proj %*% Xmat) %*% t(Xmat) %*% Zmat %*% solve(t(Zmat) %*% Zmat) %*% A %*% solve(t(Zmat) %*% Zmat) %*% t(Zmat) %*% Xmat %*% solve(t(Xmat) %*% proj %*% Xmat) var=diag(varcov) #extract the diagonal se = sqrt(var) seMat = matrix(se, dim(Xmat)[2],1) tstat = beta/seMat output =data.frame(beta, seMat, tstat) #produce the beta (estimate), the seMat (standard error under heteroscedasticity), and the tstat (t-statistics) return (output) } ``` ### 1.2 Nadaraya-Watson kernel regression estimate Since the ksmooth function provided by R can only handle the case of a Z, write a function that copes with the case of a vector of Z (size 2). ```{r} mNW <- function(x, X, Y, h, K=dnorm) { # Arguments # x: evaluation points # X: vector (size n) with the predictors # Y: vector (size n) with the response variable # h: bandwidth # K: kernel h1 = sd(X)* h # Matrix of size n x length(x) Kx = sapply(X, function(Xi) K((x - Xi) / h1, sd= 1.4826/4) / h1) #produce a matrix of k((xi - Xj)) # Weights W = Kx / rowSums(Kx) #1st row of Kx is divided by the first element of rowSums(Kx). # Means at x ("drop" to drop the matrix attributes) drop(W %*% Y) } ``` ```{r} mNW2 <- function(x1, x2, X1, X2, Y, h, K=dnorm) { # Arguments # x: evaluation points # X: vector (size n) with the predictors # Y: vector (size n) with the response variable # h: bandwidth # K: kernel h1 = sd(X1) * h h2 = sd(X2) * h # Matrix of size n x length(x) Kx1 = sapply(X1, function(Xi) K((x1 - Xi) / h1, sd= 1.4826/4) / h1) Kx2 = sapply(X2, function(Xi) K((x2 - Xi) / h2, sd= 1.4826/4) / h2) Kx = Kx1 * Kx2 # Weights W <- Kx / rowSums(Kx) # 1st row of Kx is divided by the first element of rowSums(Kx). # Means at x ("drop" to drop the matrix attributes) drop(W %*% Y) } ``` ### 1.3 R-SMD. ```{r} beta.rsmd.het= function (optbandwi, nobs, X, Z, Y, W){ #optbandwi: bandwidth #nobs: number of observations #X: the endogenous variables #Z: the exogenous variables #Y: outcome #W: the instrument Xsmo = mNW(x = Z, X=Z, Y = X, h =optbandwi) Ysmo = mNW(x = Z, X=Z, Y = Y, h =optbandwi) Xtil = X - Xsmo ytil = Y - Ysmo W = data.frame(W) b = dim(W)[2] k = matrix(1,nrow = nobs,ncol = nobs) for (i in 1:b) { kx1 = sapply(W[,i], function(Xi) exp(-(W[,i] - Xi)^2 /2)) k = k * kx1 } ktil = k - k[1,1] * diag(nrow = nobs) Xtil = as.matrix(data.frame(Xtil)) A = (1/nobs)*(1/(nobs))*t(Xtil)%*% ktil %*% Xtil C = (1/nobs)*(1/(nobs))*t(Xtil)%*% ktil %*% ytil beta =(1/A) %*% C residual = ytil -Xtil%*%beta ncolX = dim(Xtil)[2] mat1 = residual^2 %*% matrix(1,1,ncol = ncolX) B = t(( ktil %*% Xtil) * mat1) %*% ktil %*% Xtil #extract the diagonal varcov = solve(t(Xtil)%*% ktil %*% Xtil) %*% B %*% solve(t(Xtil)%*% ktil %*% Xtil) var=diag(varcov) se = sqrt(var) seMat = matrix(se, dim(Xtil)[2],1) tstat = beta/seMat output =data.frame(beta, seMat, tstat) #produce the beta (estimate), the seMat (standard error under heteroscedasticity), and the tstat (t-statistics) return (output) } ``` ## 2. example ```{r} mu=c(0,0) pi = 1 beta = 2 g = 3 alpha =4 scalepara = 8 sigma1=matrix(c(1,5/9,5/9,1),2,2) sigma2=matrix(c(1,4/9,4/9,1),2,2) nobs =200 set.seed(1) WandZ=mvrnorm(nobs,mu,sigma1) eandv=mvrnorm(nobs,mu,sigma2) X1 = WandZ[,1] X2 = WandZ[,2] Xvar = pi * X1 + alpha * (X1)^2 +X2 + alpha *(X2)^(3)+ eandv[,2] Xvar = scale(Xvar) Xvar = scalepara * Xvar y =beta * Xvar + g * X2 - g * (X2)^2 + eandv[,1] #optbandwi: bandwidth #nobs: number of observations #X: the endogenous variable. Xvar #Z: the exogenous variable. X2 #Y: outcome. y #W: the instrument. X1 maxbandwi = (1/nobs)^(0.2) # rule of thumb bandwidth bandch = maxbandwi print(bandch) beta.rsmd.het(optbandwi = bandch,nobs = nobs, X = Xvar, Z = X2, Y = y, W = X1) #RSMD under heteroscedasticity beta.gmm.het(nobs = nobs, X= Xvar, W = X1, Z = X2, Y = y) #GMM under heteroscedasticity beta.2sls.het(nobs = nobs, X = Xvar, Z = X2, Y = y, W = X1, withZ = 1) #2SLS under heteroscedasticity library(ivmodel) ivmodel(Y= y, D = Xvar, Z = X1, X=X2,heteroSE = TRUE) # #Y: outcome. D: endogenous variable. Z: the instrument. X: the exogenous variable (control) ```