#-------------------------------------------------------------------------- # QUNIONIZATION6.R # # Description: Analyzes public sector unionization data, complete code # # Author: Brian Krauth, Simon Fraser University # # Date: September 6, 2010 #-------------------------------------------------------------------------- # The "#" character is the "comment" indicator. R will ignore everything on a line after "#". # To run this file, follow these steps: # (1) Put the file along with the data files into a directory. # (2) Call the setwd() function to change the working directory to the directory containing this file. setwd("/documents and settings/brian/documents/my dropbox/teaching/econ_435/homeworks/supporting data files/qunionization") # (3) Execute the command line: source("qunionization6.r") # #-------------------------------------------------------------------------- # Initial setup #-------------------------------------------------------------------------- # Clear the memory remove(list=objects()) # Now we will define a few functions #-------------------------------------------------------------------------- # READYEAR function # # Purpose: Reads in and cleans a year of data. # # Usage: readyear(year) # # year The year in 4-digit format (eg., 2003) # #-------------------------------------------------------------------------- readyear <- function(year){ yr <- substring(year,3) #-------------------------------------------------------------------------- # Read in the data from the CSV files #-------------------------------------------------------------------------- # For unionization data file State U_2003.xls # (1) Delete rows 260-267 and 1-3. # (2) Reformat entire sheet as "GENERAL" # (3) Save as the CSV file State U_2003.csv # We use the function read.csv() to read in data from a CSV file. Comments: # (1) S-Plus doesn't have read.csv(), so you would need to use the read.table() function # (2) The row.names=NULL option tells R not to use the first variable to give names to the rows. # (3) The stringsAsFactors=FALSE tells R to read in string variables (e.g., state name) as strings. By default, R will # convert strings into what it calls "factors". unionization <- read.csv(paste("State U_",year,".csv",sep=""),row.names=NULL,stringsAsFactors=FALSE) # For population data file NST-EST2008-01.XLS: # (1) Delete rows 62-67 and 1-3. Note that row 1 is almost hidden! # (2) Reformat entire sheet EXCEPT header row as "GENERAL" # (3) Save as CSV file NST-EST2008-01.csv pop <- read.csv("NST-EST2008-01.csv",row.names=NULL,stringsAsFactors=FALSE) # For payroll data file 03stall.xls: # (1) Create complete variable names in row 4. # (2) Delete rows 1-3 # (3) Reformat entire sheet as "GENERAL" # (3) Save as CSV file 03stall.csv payroll <- read.csv(paste(yr,"stall.csv",sep=""),row.names=NULL,stringsAsFactors=FALSE) #-------------------------------------------------------------------------- # Clean the unionization data #-------------------------------------------------------------------------- # We only care about public-sector unionization, so drop all other sectors unionization <- unionization[toupper(unionization$Sector) == "PUBLIC",] # We don't want DC in there, so drop it unionization <- unionization[unionization$State != "D.C.",] # Alphabetize the data by state name unionization <- unionization[sort.list(unionization$State),] # Add in the state postal abbreviations. # These can be found in a predefined data set called state.abb. # They are listed in alphabetical order by state name (that's why we just alphabetized by state name) unionization$State.abb <- state.abb # The membership and coverage rates are reported in the data, but only to one decimal place # We can do better by calculating them from the original variables unionization$Coverage.Rate <- 100*unionization$Covered/unionization$Employment unionization$Membership.Rate <- 100*unionization$Members/unionization$Employment # Get rid of variables we don't need. Earlier I suggested doing this by setting the # value of each undesired variable to NULL. Instead what I do is specify the variables # I want to *keep*. This ends up working better when we try to combine data from # multiple years. unionization <- unionization[,c("State","State.abb","Membership.Rate","Coverage.Rate")] #-------------------------------------------------------------------------- # Clean the population data #-------------------------------------------------------------------------- # The population data set has multiple years of population estimates in a single data file. # So while we use a different file for each year of unionization and payroll data, here # we use the same file but a different variable name for a given year's population. # The population in 2003 (for exxample) is called X1.Jul.03 pop$Population <- pop[,paste("X1.Jul.",yr,sep="")] # State names in this data set start with a period (for example, ".Alabama") Get rid of the leading "." pop$State <- substring(pop$State,2) # We don't want DC in there, so drop it pop <- pop[pop$State %in% state.name,] # Alphabetize the data by state name pop <- pop[sort.list(pop$State),] # Add in the state postal abbreviations pop$State.abb <- state.abb # Get rid of any variables we don't need pop <- pop[,c("State.abb","Population")] #-------------------------------------------------------------------------- # Clean the payroll data #-------------------------------------------------------------------------- # We only care about total payroll, so drop all other functions payroll <- payroll[toupper(substr(payroll$Function,1,5))=="TOTAL",] # Drop the results for the U.S. as a whole payroll <- payroll[toupper(payroll$State.Name) != "UNITED STATES",] # Alphabetize the data by state name payroll <- payroll[sort.list(payroll$State.Name),] # Add in the state postal abbreviations payroll$State.abb <- state.abb # Get rid of any variables we don't need payroll <- payroll[,c("State.abb","Full.Time.Employees","FTE.Employment","Total.March.Payroll")] #-------------------------------------------------------------------------- # Merge the three data sets into a single data set #-------------------------------------------------------------------------- # We merge by state postal abbreviation dat <- merge(merge(unionization,pop,by="State.abb"),payroll,by="State.abb") #-------------------------------------------------------------------------- # Clean the combined data set #-------------------------------------------------------------------------- # These are all variables used in the regressions dat$FTPer100K <- 100000*dat$Full.Time.Employees/dat$Population dat$FTEPer100K <- 100000*dat$FTE.Employment/dat$Population dat$PerCapitaPayroll <- dat$Total.March.Payroll/dat$Population dat$PerFTEPayroll <- dat$Total.March.Payroll/dat$FTE.Employment dat$Population.In.Millions <- dat$Population/1000000 dat$Log.Population <- log(dat$Population) # When we create a panel data set we will need the year dat$Year <- year dat$Full.Time.Employees <- dat$Population <- dat$FTE.Employment <- dat$Total.March.Payroll <- dat$FTE.Employment <- NULL # The last line of a function tells you what value to return dat } table1 <- function(dat) { # The cat() function prints out a string of text. "\n" means carriage return. cat("\n---------------------------------------------------\nTABLE 1: \n---------------------------------------------------\n") cat("\nTABLE 1, MEAN, MEDIAN, MIN, MAX: \n \n") print(summary(dat)) cat("\nTABLE 1, STANDARD DEVIATION: \n \n") print(apply(dat,2,function(x) sd(x))) } table2 <- function(dat){ cat("\n---------------------------------------------------\nTABLE 2: \n---------------------------------------------------\n") cat("\nTABLE 2, FULL-TIME EMPLOYMENT, SPECIFICATION (1): \n \n") # This is the code needed to output regression results in a nice-looking manner. # The lm() function actually estimates the regression # The summary() function, applied to the results of lm(), calculates the # standard errors, t-statistics, etc. # The coef() function extracts just the coefficients, standard errors, # and t-statistics # The print() function prints the results print(summary(lm(FTPer100K ~ Membership.Rate,data=dat))) cat("\nTABLE 2, FULL-TIME EMPLOYMENT, SPECIFICATION (2): \n \n") print(summary(lm(FTPer100K ~ Coverage.Rate,data=dat))) cat("\nTABLE 2, FULL-TIME EMPLOYMENT, SPECIFICATION (3): \n \n") print(summary(lm(FTPer100K ~ Membership.Rate + Log.Population,data=dat))) cat("\nTABLE 2, FTE EMPLOYMENT, SPECIFICATION (1): \n \n") print(summary(lm(FTEPer100K ~ Membership.Rate,data=dat))) cat("\nTABLE 2, FTE EMPLOYMENT, SPECIFICATION (2): \n \n") print(summary(lm(FTEPer100K ~ Coverage.Rate,data=dat))) cat("\nTABLE 2, FTE EMPLOYMENT, SPECIFICATION (3): \n \n") print(summary(lm(FTEPer100K ~ Membership.Rate + Log.Population,data=dat))) } table3 <- function(dat){ cat("\n---------------------------------------------------\nTABLE 3: \n---------------------------------------------------\n") cat("\nTABLE 3, PER-CAPITA PAYROLL, SPECIFICATION (1): \n \n") print(summary(lm(PerCapitaPayroll ~ Membership.Rate,data=dat))) cat("\nTABLE 3, PER-CAPITA PAYROLL, SPECIFICATION (2): \n \n") print(summary(lm(PerCapitaPayroll ~ Coverage.Rate,data=dat))) cat("\nTABLE 3, PER-CAPITA PAYROLL, SPECIFICATION (3): \n \n") print(summary(lm(PerCapitaPayroll ~ Membership.Rate + Log.Population,data=dat))) cat("\nTABLE 3, PER-FTE PAYROLL, SPECIFICATION (1): \n \n") print(summary(lm(PerFTEPayroll ~ Membership.Rate,data=dat))) cat("\nTABLE 3, PER-FTE PAYROLL, SPECIFICATION (2): \n \n") print(summary(lm(PerFTEPayroll ~ Coverage.Rate,data=dat))) cat("\nTABLE 3, PER-FTE PAYROLL, SPECIFICATION (3): \n \n") print(summary(lm(PerFTEPayroll ~ Membership.Rate + Log.Population,data=dat))) } table4 <- function(paneldat) { # To estimate a panel data regression with fixed effects we use the plm function from the plm library library("plm") cat("\n---------------------------------------------------\nTABLE 4: \n---------------------------------------------------\n") cat("\nTABLE 4, FULL-TIME EMPLOYMENT: \n \n") # Note that the effect= option is set to "twoways". This means both state and year effects. # The default option is state effects but not year effects. print(coef(summary(plm(FTPer100K ~ Membership.Rate,data=paneldat,effect="twoways",index=c("State.abb","Year"))))) cat("\nTABLE 4, FTE EMPLOYMENT: \n \n") print(coef(summary(plm(FTEPer100K ~ Membership.Rate,data=paneldat,effect="twoways",index=c("State.abb","Year"))))) cat("\nTABLE 4, PER-CAPITA PAYROLL: \n \n") print(coef(summary(plm(PerCapitaPayroll ~ Membership.Rate,data=paneldat,effect="twoways",index=c("State.abb","Year"))))) cat("\nTABLE 4, PER-FTE PAYROLL: \n \n") print(coef(summary(plm(PerFTEPayroll ~ Membership.Rate,data=paneldat,effect="twoways",index=c("State.abb","Year"))))) } table5 <- function(dat){ # Link in the Hoxby data (I'm assuming it's stored in the working directory) dat <- merge(dat,read.csv("hoxby.csv"),by="State.abb") cat("\n---------------------------------------------------\nTABLE 5: \n---------------------------------------------------\n") cat("Number of states with right to collective bargaining: ",sum(dat$Duty),"\n") cat("Number of states with agency shops: ",sum(dat$Agency),"\n") cat("Number of states with closed shops: ",sum(dat$Closed),"\n") cat("\nTABLE 5, FIRST-STAGE REGRESSION (AND F STATISTIC): \n \n") # First stage regression tmp <- lm(Membership.Rate ~ Log.Population + Duty + Agency + Closed, data=dat) print(coef(summary(tmp))) # Next we get the F-statistic on the excluded instruments # To test a general linear hypothesis we use the linearHypothesis function in the CAR library library("car") print(linearHypothesis(tmp,rbind(c(0,0,1,0,0),c(0,0,0,1,0),c(0,0,0,0,1)),c(0,0,0))) # To estimate a regression using instrumental variables we use the ivreg function in the AER library library("AER") cat("\nTABLE 5, FULL-TIME EMPLOYMENT: \n \n") print(coef(summary(ivreg(FTPer100K ~ Membership.Rate + Log.Population | Log.Population + Duty + Agency + Closed,data=dat)))) cat("\nTABLE 5, FTE EMPLOYMENT: \n \n") print(coef(summary(ivreg(FTEPer100K ~ Membership.Rate + Log.Population | Log.Population + Duty + Agency + Closed,data=dat)))) cat("\nTABLE 5, PER-CAPITA PAYROLL: \n \n") print(coef(summary(ivreg(PerCapitaPayroll ~ Membership.Rate + Log.Population | Log.Population + Duty + Agency + Closed,data=dat)))) cat("\nTABLE 5, PER-FTE PAYROLL: \n \n") print(coef(summary(ivreg(PerFTEPayroll ~ Membership.Rate + Log.Population | Log.Population + Duty + Agency + Closed,data=dat)))) } figure1 <- function(dat){ # -------------------FIGURE 1--------------------- # Simple version #hist(dat$Membership.Rate) # A nicer version hist(dat$Membership.Rate,xlab="Membership Rate",main="") # An alternative version... #plot(density(dat$Membership.Rate),xlab="Membership Rate",main="") # The savePlot command exports the plot to a file. savePlot("fig1",type="jpg") } figure2 <- function(dat) { # -------------------FIGURE 2--------------------- # Simple version #plot(dat$Membership.Rate,dat$PerCapitaPayroll) # A nicer graph that uses the state abbreviations instead of dots # In the plot() function, the type="n" option means to set up the graph but NOT plot the points plot(dat$Membership.Rate,dat$PerCapitaPayroll,xlab="Membership Rate",ylab="Per Capita Payroll",type="n") # The text() function allows us to plot the state abbreviations right where that state's point would be text(dat$Membership.Rate,dat$PerCapitaPayroll,labels=dat$State.abb) savePlot("fig2",type="jpg") } figure3 <- function(dat) { # -------------------FIGURE 3--------------------- # The function supsmu() estimates a nonparametric regression plot(supsmu(dat$Membership.Rate,dat$PerCapitaPayroll),type="l",xlab="Membership Rate",ylab="Per Capita Payroll",main="Linear and nonparametric fits") # The function abline() plots a line given its slope and intercept abline(reg=lm(PerCapitaPayroll ~ Membership.Rate,data=dat),col=2,lty=2) savePlot("fig3",type="jpg") } #-------------------------------------------------------------------------- # BEGIN RUN CODE #-------------------------------------------------------------------------- #-------------------------------------------------------------------------- # Read in each year of data #-------------------------------------------------------------------------- dat00 <- readyear(2000) dat01 <- readyear(2001) dat02 <- readyear(2002) dat03 <- readyear(2003) dat04 <- readyear(2004) dat05 <- readyear(2005) dat06 <- readyear(2006) dat07 <- readyear(2007) dat08 <- readyear(2008) # Create the panel data set paneldat <- rbind(dat00,dat01,dat02,dat03,dat04,dat05,dat06,dat07,dat08) #-------------------------------------------------------------------------- # Make the tables #-------------------------------------------------------------------------- # The sink() function below sends a copy of all subsequent output to a text file sink(file="tables.txt",split=T) table1(dat03) table2(dat03) table3(dat03) table4(paneldat) table5(dat03) # Invoking sink() with no arguments closes the file opened earlier. sink() #-------------------------------------------------------------------------- # Make the figures #-------------------------------------------------------------------------- figure1(dat03) figure2(dat03) figure3(dat03)