#--------------------------------------------------------------------------
# QUNIONIZATION6.R
#
# Description: Analyzes public sector unionization data, complete code
#
# Author: Brian Krauth, Simon Fraser University
#
# Date: September 6, 2010
#--------------------------------------------------------------------------
# The "#" character is the "comment" indicator.  R will ignore everything on a line after "#".

# To run this file, follow these steps:
#	(1) Put the file along with the data files into a directory.
# 	(2) Call the setwd() function to change the working directory to the directory containing this file.
setwd("/documents and settings/brian/documents/my dropbox/teaching/econ_435/homeworks/supporting data files/qunionization")
#   (3) Execute the command line: source("qunionization6.r")
#
#--------------------------------------------------------------------------
# Initial setup
#--------------------------------------------------------------------------
# Clear the memory
remove(list=objects())
# Now we will define a few functions

#--------------------------------------------------------------------------
# READYEAR function
#
# Purpose: Reads in and cleans a year of data.
#
# Usage: readyear(year)
#
#		year 	The year in 4-digit format (eg., 2003)
#
#--------------------------------------------------------------------------
readyear <- function(year){
	yr <- substring(year,3)
	#--------------------------------------------------------------------------
	# Read in the data from the CSV files
	#--------------------------------------------------------------------------
	# For unionization data file State U_2003.xls
	#	(1) Delete rows 260-267 and 1-3.
	#	(2) Reformat entire sheet as "GENERAL" 
	#	(3) Save as the CSV file State U_2003.csv
	# We use the function read.csv() to read in data from a CSV file.  Comments:
	# 	(1) S-Plus doesn't have read.csv(), so you would need to use the read.table() function
	#	(2) The row.names=NULL option tells R not to use the first variable to give names to the rows.
	#	(3) The stringsAsFactors=FALSE tells R to read in string variables (e.g., state name) as strings.  By default, R will
	#       convert strings into what it calls "factors".
	unionization <- read.csv(paste("State U_",year,".csv",sep=""),row.names=NULL,stringsAsFactors=FALSE)
	# For population data file NST-EST2008-01.XLS:
	#	(1) Delete rows 62-67 and 1-3.  Note that row 1 is almost hidden!
	#	(2) Reformat entire sheet EXCEPT header row as "GENERAL"
	#	(3) Save as CSV file NST-EST2008-01.csv
	pop <- read.csv("NST-EST2008-01.csv",row.names=NULL,stringsAsFactors=FALSE) 
	# For payroll data file 03stall.xls:
	#	(1) Create complete variable names in row 4.  
	#   (2) Delete rows 1-3
	#	(3) Reformat entire sheet as "GENERAL"
	#	(3) Save as CSV file 03stall.csv
	payroll <- read.csv(paste(yr,"stall.csv",sep=""),row.names=NULL,stringsAsFactors=FALSE)	
	#--------------------------------------------------------------------------
	# Clean the unionization data
	#--------------------------------------------------------------------------
	# We only care about public-sector unionization, so drop all other sectors
	unionization <- unionization[toupper(unionization$Sector) == "PUBLIC",]
	# We don't want DC in there, so drop it
	unionization <- unionization[unionization$State != "D.C.",]
	# Alphabetize the data by state name
	unionization <- unionization[sort.list(unionization$State),]
	# Add in the state postal abbreviations.  
	# These can be found in a predefined data set called state.abb.  
	# They are listed in alphabetical order by state name (that's why we just alphabetized by state name)
	unionization$State.abb <- state.abb
	# The membership and coverage rates are reported in the data, but only to one decimal place
	# We can do better by calculating them from the original variables
	unionization$Coverage.Rate <- 100*unionization$Covered/unionization$Employment
	unionization$Membership.Rate <- 100*unionization$Members/unionization$Employment
	# Get rid of variables we don't need.  Earlier I suggested doing this by setting the 
	# value of each undesired variable to NULL.  Instead what I do is specify the variables
	# I want to *keep*.  This ends up working better when we try to combine data from 
	# multiple years.
	unionization <- unionization[,c("State","State.abb","Membership.Rate","Coverage.Rate")]
	#--------------------------------------------------------------------------
	# Clean the population data
	#--------------------------------------------------------------------------
	# The population data set has multiple years of population estimates in a single data file.
	# So while we use a different file for each year of unionization and payroll data, here
	# we use the same file but a different variable name for a given year's population.
	# The population in 2003 (for exxample) is called X1.Jul.03
	pop$Population <- pop[,paste("X1.Jul.",yr,sep="")]
	# State names in this data set start with a period (for example, ".Alabama")  Get rid of the leading "."
	pop$State <- substring(pop$State,2)
	# We don't want DC in there, so drop it
	pop <- pop[pop$State %in% state.name,]
	# Alphabetize the data by state name
	pop <- pop[sort.list(pop$State),]
	# Add in the state postal abbreviations
	pop$State.abb <- state.abb
	# Get rid of any variables we don't need
	pop <- pop[,c("State.abb","Population")]
	#--------------------------------------------------------------------------
	# Clean the payroll data
	#--------------------------------------------------------------------------
	# We only care about total payroll, so drop all other functions
	payroll <- payroll[toupper(substr(payroll$Function,1,5))=="TOTAL",]
	# Drop the results for the U.S. as a whole
	payroll <- payroll[toupper(payroll$State.Name) != "UNITED STATES",]
	# Alphabetize the data by state name
	payroll <- payroll[sort.list(payroll$State.Name),]
	# Add in the state postal abbreviations
	payroll$State.abb <- state.abb
	# Get rid of any variables we don't need
	payroll <- payroll[,c("State.abb","Full.Time.Employees","FTE.Employment","Total.March.Payroll")]
	#--------------------------------------------------------------------------
	# Merge the three data sets into a single data set
	#--------------------------------------------------------------------------
	# We merge by state postal abbreviation
	dat <- merge(merge(unionization,pop,by="State.abb"),payroll,by="State.abb")
	#--------------------------------------------------------------------------
	# Clean the combined data set
	#--------------------------------------------------------------------------
	# These are all variables used in the regressions
	dat$FTPer100K <- 100000*dat$Full.Time.Employees/dat$Population
	dat$FTEPer100K <- 100000*dat$FTE.Employment/dat$Population
	dat$PerCapitaPayroll <- dat$Total.March.Payroll/dat$Population
	dat$PerFTEPayroll <- dat$Total.March.Payroll/dat$FTE.Employment
	dat$Population.In.Millions <- dat$Population/1000000
	dat$Log.Population <- log(dat$Population)
	# When we create a panel data set we will need the year
	dat$Year <- year
	dat$Full.Time.Employees <- dat$Population <- dat$FTE.Employment <- dat$Total.March.Payroll <- dat$FTE.Employment <- NULL
	# The last line of a function tells you what value to return
	dat
}

table1 <- function(dat) {
	# The cat() function prints out a string of text.  "\n" means carriage return.
	cat("\n---------------------------------------------------\nTABLE 1: \n---------------------------------------------------\n")
	cat("\nTABLE 1, MEAN, MEDIAN, MIN, MAX: \n \n")
	print(summary(dat))
	cat("\nTABLE 1, STANDARD DEVIATION: \n \n")	
	print(apply(dat,2,function(x) sd(x)))
}

table2 <- function(dat){
	cat("\n---------------------------------------------------\nTABLE 2: \n---------------------------------------------------\n")
	cat("\nTABLE 2, FULL-TIME EMPLOYMENT, SPECIFICATION (1): \n \n")
	# This is the code needed to output regression results in a nice-looking manner.
	#    The lm() function actually estimates the regression
	#    The summary() function, applied to the results of lm(), calculates the
	#        standard errors, t-statistics, etc.
	#    The coef() function extracts just the coefficients, standard errors, 
	#        and t-statistics
	#    The print() function prints the results
	print(summary(lm(FTPer100K ~ Membership.Rate,data=dat)))
	cat("\nTABLE 2, FULL-TIME EMPLOYMENT, SPECIFICATION (2): \n \n")
	print(summary(lm(FTPer100K ~ Coverage.Rate,data=dat)))
	cat("\nTABLE 2, FULL-TIME EMPLOYMENT, SPECIFICATION (3): \n \n")
	print(summary(lm(FTPer100K ~ Membership.Rate + Log.Population,data=dat)))
	cat("\nTABLE 2, FTE EMPLOYMENT, SPECIFICATION (1): \n \n")
	print(summary(lm(FTEPer100K ~ Membership.Rate,data=dat)))
	cat("\nTABLE 2, FTE EMPLOYMENT, SPECIFICATION (2): \n \n")
	print(summary(lm(FTEPer100K ~ Coverage.Rate,data=dat)))
	cat("\nTABLE 2, FTE EMPLOYMENT, SPECIFICATION (3): \n \n")
	print(summary(lm(FTEPer100K ~ Membership.Rate + Log.Population,data=dat)))
}

table3 <- function(dat){
	cat("\n---------------------------------------------------\nTABLE 3: \n---------------------------------------------------\n")
	cat("\nTABLE 3, PER-CAPITA PAYROLL, SPECIFICATION (1): \n \n")
	print(summary(lm(PerCapitaPayroll ~ Membership.Rate,data=dat)))
	cat("\nTABLE 3, PER-CAPITA PAYROLL, SPECIFICATION (2): \n \n")
	print(summary(lm(PerCapitaPayroll ~ Coverage.Rate,data=dat)))
	cat("\nTABLE 3, PER-CAPITA PAYROLL, SPECIFICATION (3): \n \n")
	print(summary(lm(PerCapitaPayroll ~ Membership.Rate + Log.Population,data=dat)))
	cat("\nTABLE 3, PER-FTE PAYROLL, SPECIFICATION (1): \n \n")
	print(summary(lm(PerFTEPayroll ~ Membership.Rate,data=dat)))
	cat("\nTABLE 3, PER-FTE PAYROLL, SPECIFICATION (2): \n \n")
	print(summary(lm(PerFTEPayroll ~ Coverage.Rate,data=dat)))
	cat("\nTABLE 3, PER-FTE PAYROLL, SPECIFICATION (3): \n \n")
	print(summary(lm(PerFTEPayroll ~ Membership.Rate + Log.Population,data=dat)))
}

table4 <- function(paneldat) {
	# To estimate a panel data regression with fixed effects we use the plm function from the plm library
	library("plm")
	cat("\n---------------------------------------------------\nTABLE 4: \n---------------------------------------------------\n")
	cat("\nTABLE 4, FULL-TIME EMPLOYMENT: \n \n")
	# Note that the effect= option is set to "twoways". This means both state and year effects.
	# The default option is state effects but not year effects.
	print(coef(summary(plm(FTPer100K ~ Membership.Rate,data=paneldat,effect="twoways",index=c("State.abb","Year")))))
	cat("\nTABLE 4, FTE EMPLOYMENT: \n \n")
	print(coef(summary(plm(FTEPer100K ~ Membership.Rate,data=paneldat,effect="twoways",index=c("State.abb","Year")))))
	cat("\nTABLE 4, PER-CAPITA PAYROLL: \n \n")
	print(coef(summary(plm(PerCapitaPayroll  ~ Membership.Rate,data=paneldat,effect="twoways",index=c("State.abb","Year")))))
	cat("\nTABLE 4, PER-FTE PAYROLL: \n \n")
	print(coef(summary(plm(PerFTEPayroll ~ Membership.Rate,data=paneldat,effect="twoways",index=c("State.abb","Year")))))
}

table5 <- function(dat){
	# Link in the Hoxby data (I'm assuming it's stored in the working directory)
	dat <- merge(dat,read.csv("hoxby.csv"),by="State.abb")
	cat("\n---------------------------------------------------\nTABLE 5: \n---------------------------------------------------\n")
	cat("Number of states with right to collective bargaining: ",sum(dat$Duty),"\n") 
	cat("Number of states with agency shops: ",sum(dat$Agency),"\n") 
	cat("Number of states with closed shops: ",sum(dat$Closed),"\n") 
	cat("\nTABLE 5, FIRST-STAGE REGRESSION (AND F STATISTIC): \n \n")	
	# First stage regression
	tmp <- lm(Membership.Rate ~ Log.Population + Duty + Agency + Closed, data=dat)
	print(coef(summary(tmp)))
	# Next we get the F-statistic on the excluded instruments
	# To test a general linear hypothesis we use the linearHypothesis function in the CAR library
	library("car")
	print(linearHypothesis(tmp,rbind(c(0,0,1,0,0),c(0,0,0,1,0),c(0,0,0,0,1)),c(0,0,0)))
	# To estimate a regression using instrumental variables we use the ivreg function in the AER library
	library("AER")
	cat("\nTABLE 5, FULL-TIME EMPLOYMENT: \n \n")
	print(coef(summary(ivreg(FTPer100K ~ Membership.Rate + Log.Population | Log.Population + Duty + Agency + Closed,data=dat))))
	cat("\nTABLE 5, FTE EMPLOYMENT: \n \n")
	print(coef(summary(ivreg(FTEPer100K ~ Membership.Rate + Log.Population | Log.Population + Duty + Agency + Closed,data=dat))))
	cat("\nTABLE 5, PER-CAPITA PAYROLL: \n \n")
	print(coef(summary(ivreg(PerCapitaPayroll ~ Membership.Rate + Log.Population | Log.Population + Duty + Agency + Closed,data=dat))))
	cat("\nTABLE 5, PER-FTE PAYROLL: \n \n")
	print(coef(summary(ivreg(PerFTEPayroll ~ Membership.Rate + Log.Population | Log.Population + Duty + Agency + Closed,data=dat))))
}

figure1 <- function(dat){
	# -------------------FIGURE 1---------------------
	# Simple version
	#hist(dat$Membership.Rate)
	# A nicer version 
	hist(dat$Membership.Rate,xlab="Membership Rate",main="")
	# An alternative version...
	#plot(density(dat$Membership.Rate),xlab="Membership Rate",main="")
	# The savePlot command exports the plot to a file.  
	savePlot("fig1",type="jpg")
}

figure2 <- function(dat) {
	# -------------------FIGURE 2---------------------
	# Simple version
	#plot(dat$Membership.Rate,dat$PerCapitaPayroll)
	# A nicer graph that uses the state abbreviations instead of dots
	# In the plot() function, the type="n" option means to set up the graph but NOT plot the points
	plot(dat$Membership.Rate,dat$PerCapitaPayroll,xlab="Membership Rate",ylab="Per Capita Payroll",type="n")
	# The text() function allows us to plot the state abbreviations right where that state's point would be
	text(dat$Membership.Rate,dat$PerCapitaPayroll,labels=dat$State.abb)
	savePlot("fig2",type="jpg")
}

figure3 <- function(dat) {
	# -------------------FIGURE 3---------------------
	# The function supsmu() estimates a nonparametric regression
	plot(supsmu(dat$Membership.Rate,dat$PerCapitaPayroll),type="l",xlab="Membership Rate",ylab="Per Capita Payroll",main="Linear and nonparametric fits")
	# The function abline() plots a line given its slope and intercept 
	abline(reg=lm(PerCapitaPayroll ~ Membership.Rate,data=dat),col=2,lty=2)
	savePlot("fig3",type="jpg")
}

#--------------------------------------------------------------------------
# BEGIN RUN CODE
#--------------------------------------------------------------------------
#--------------------------------------------------------------------------
# Read in each year of data
#--------------------------------------------------------------------------
dat00 <- readyear(2000)
dat01 <- readyear(2001)
dat02 <- readyear(2002)
dat03 <- readyear(2003)
dat04 <- readyear(2004)
dat05 <- readyear(2005)
dat06 <- readyear(2006)
dat07 <- readyear(2007)
dat08 <- readyear(2008)
# Create the panel data set  
paneldat <- rbind(dat00,dat01,dat02,dat03,dat04,dat05,dat06,dat07,dat08)
#--------------------------------------------------------------------------
# Make the tables
#--------------------------------------------------------------------------
# The sink() function below sends a copy of all subsequent output to a text file
sink(file="tables.txt",split=T)
table1(dat03)
table2(dat03)
table3(dat03)
table4(paneldat)
table5(dat03)
# Invoking sink() with no arguments closes the file opened earlier.
sink()
#--------------------------------------------------------------------------
# Make the figures
#--------------------------------------------------------------------------
figure1(dat03)
figure2(dat03)
figure3(dat03)