pacman::p_load(tidyverse, RStata, readxl, lmtest, sandwich) setwd("/Users/shanchaowang/Desktop/Submission Data/") # set this to your local directory where data is stored # output names out_dist = "Distribution.Rdata" ##### load data ##### # R & D investment RD_INSTEPP = read_xlsx("Wang et al. _R_D lags_Regression-Data (2023-06-02).xlsx", sheet = "2. Ag R&D Spending") %>% as.data.frame() # Weather index WI = read_xlsx("Wang et al. _R_D lags_Regression-Data (2023-06-02).xlsx", sheet = "4. Weather Index") %>% rename(YEAR = Year, INDEX = `Weather Index`) %>% select(YEAR, INDEX) # MFP level instead of log MFP = read_xlsx("Wang et al. _R_D lags_Regression-Data (2023-06-02).xlsx", sheet = "1. MFP") ##### distributions ##### # each take data, parameters, base year, whether we normalize it # output: year, investment flow, knowledge stock, distribution type # starting from current year #### Gamma distribution gweights = function(lag, g, delta, lambda){ bweights = rep(NA, lag + 1) bweights[1:(g+1)] = 0 k = (g + 1):lag bweights[(g + 2): length(bweights)] = sapply(k, function(x) (x - g + 1)^(delta/(1-delta))*lambda^(x - g)) bweights = bweights/sum(bweights) return(bweights) } stock_gamma = function(data, lag, delta, lambda, gestation, norm = F, base = 1940){ # if gestation = 1, then first two years are assigned with 0 weights # remove NA data, data is in the format that the first col is year and the second col is number data = data[which(!is.na(data[,2])),] data = data %>% mutate(Stock = NA) # calculate lag weights bweights = gweights(lag = lag, g = gestation, delta = delta, lambda = lambda) # use the weight to calculate knowledge stock for(i in (length(bweights): nrow(data))){ data$Stock[i] = sum(bweights * data[i:(i-lag),2]) } data = data %>% mutate(Type = "Gamma") if(norm == T){ data$Stock = data$Stock/data$Stock[which(data$Year == base)]*100 # rebase to make first observation be 100 } colnames(data) = c("Year", "Value", "Stock", "Type") return(data) } #### Trarpezoid # a = gestation end, b = increase end, c = constant end, d = decrease end trapezoid = function(x, a, b, c, d){ if(x < a | x > d){ res = 0 }else if(x >= a & x < b){ res = 2/(d + c - a - b)*((x-a)/(b-a)) }else if(x >= b & x < c){ res = 2/(d + c - a - b) }else{ res = 2/(d + c - a - b)*((d-x)/(d-c)) } return(res) } stock_trap = function(data, lag, gestation, increase, constant, decrease, norm = F, base = 1940){ # remove NA data, data is in the format that the first col is year and the second col is investment data = data[which(!is.na(data[,2])),] data = data %>% mutate(Stock = NA) # calculate lag weights bweights = rep(NA, lag + 1) bweights = sapply(0:lag, trapezoid, a = gestation, b = increase, c = constant, d = decrease) bweights = bweights/sum(bweights) # use the weight to calculate knowledge stock for(i in (length(bweights): nrow(data))){ data$Stock[i] = sum(bweights * data[i:(i-lag),2]) } data = data %>% mutate(Type = "Trap") if(norm == T){ data$Stock = data$Stock/data$Stock[which(data$Year == base)]*100 } colnames(data) = c("Year", "Value", "Stock", "Type") return(data) } #### Geometric distribution stock_geom = function(data, lag, depreciation, g_period=10, gestation, norm = F, base = 1940){ # remove NA data, data is in the format that the first col is year and the second col is number data = data[which(!is.na(data[,2])),] data = data %>% mutate(Stock = NA) # calculate lag weights bweights = sapply((gestation+1):lag, function(x) (1-depreciation)^x) bweights = c(rep(0, gestation+1), bweights/sum(bweights)) # use the weight to calculate knowledge stock for(i in (length(bweights): nrow(data))){ data$Stock[i] = sum(bweights * data[i:(i-lag),2]) } data = data %>% mutate(Type = paste0("Geom",depreciation*100)) if(norm == T){ data$Stock = data$Stock/data$Stock[which(data$Year == base)]*100 } colnames(data) = c("Year", "Value", "Stock", "Type") return(data) } #### Bloom models # The first two assume infinite stream of investment # Starting from first data point, use the ex-post 10 years growth rate to recover benchmark stock # bloom.expost = function(data, post=10, norm = F, base = 1940){ # # remove NA data, data is in the format that the first col is year and the second col is number # data = data[which(!is.na(data[,2])),] # data = data %>% mutate(Stock = NA) # # calculate growth rates # g_rate = (lead(data[,2]) - data[,2])/data[,2] # g = mean(g_rate[1:post], na.rm = T) # # benchmark stock # data$Stock[1] = data$`Public AgRD`[1]/g # # fill out stock # for (i in 2:nrow(data)){ # data$Stock[i] = data$Stock[i-1] + data[i,2] # } # data = data %>% mutate(Type = paste0("Expost",post)) # # rebase if norm == T # if(norm == T){ # data$Stock = data$Stock/data$Stock[which(data$Year == base)]*100 # } # colnames(data) = c("Year", "Value", "Stock", "Type") # return(data) # } # # # starting from base year 1940, and use real average growth rate up to 1940 to recover benchmark knowledge stock # bloom.real = function(data, norm = F, base = 1940){ # # remove NA data, data is in the format that the first col is year and the second col is number # data = data[which(!is.na(data[,2])),] # data = data %>% mutate(Stock = NA) # # calculate growth rates # g = (lead(data[,2]) - data[,2])/data[,2] # g = mean(g[1:which(data$Year == base)], na.rm = T) # # benchmark stock # data$Stock[which(data$Year == base)] = data$`Public AgRD`[which(data$Year == base)]/g # # fill out stock # for (i in (which(data$Year == base)+1):nrow(data)){ # data$Stock[i] = data$Stock[i-1] + data[i,2] # } # data = data %>% mutate(Type = "Real") # # rebase if norm == T # if(norm == T){ # data$Stock = data$Stock/data$Stock[which(data$Year == base)]*100 # } # colnames(data) = c("Year", "Value", "Stock", "Type") # return(data) # } # linearly backcast stock to 1862 when USDA established. 1861 investment is 0 bloom.linear = function(data, norm = F, base = 1940){ data = data[which(!is.na(data[,2])),] colnames(data) = c('Year', 'Value') temp_data = data.frame(Year = 1862:(data$Year[1]-1)) temp_data$Value = sapply(temp_data$Year, function(x) data$Value[1]/(data$Year[1]-1861)*x -(1861*data$Value[1])/(data$Year[1]-1861)) data = rbind(temp_data, data) data = data %>% mutate(Stock = cumsum(Value), Type = "Linear") # rebase if norm == T if(norm == T){ data$Stock = data$Stock/data$Stock[which(data$Year == base)]*100 } return(data) } # #### flat backcast stock to 1862 when USDA established. Using the last available data point # bloom.flat = function(data, norm = F, base = 1940){ # data = data[which(!is.na(data[,2])),] # colnames(data) = c('Year', 'Value') # temp_data = data.frame(Year = 1862:(data$Year[1]-1), Value = data$Value[1]) # data = rbind(temp_data, data) # data = data %>% mutate(Stock = cumsum(Value), Type = "Flat") # # rebase if norm == T # if(norm == T){ # data$Stock = data$Stock/data$Stock[which(data$Year == base)]*100 # } # return(data) # } ##### constructing distributions ##### # Initialize parameters gestation = 1 lag_years = 50 # gamma delta = seq(0.6, 0.95, by = 0.05) lambda = seq(0.6, 0.95, by = 0.05) # trapezoid increase_trap = 9 constant_trap = 15 decrease_trap = 35 # construct weights # Gamma distribution weights weights_data = apply(expand.grid(delta, lambda), 1, function(x) gweights(lag = lag_years, gestation, x[1], x[2])) %>% as.data.frame() colnames(weights_data) = apply(expand.grid(delta, lambda), 1, function(x) paste0("Delta=",x[1],", Lambda=", x[2])) # Trapezoid distribution weights weights_data = weights_data %>% mutate(Trapezoid = sapply(0:lag_years, trapezoid, a = gestation, b = 9, c = 15, d = 35), Lags = 0:lag_years) # Geometric distribution weights geo_temp = sapply(weights_data$Lags[(gestation+2):nrow(weights_data)], function(x) {(1-0.1)^(x)}) weights_data$Geom10 = c(rep(0,gestation+1), geo_temp/sum(geo_temp)) geo_temp = sapply(weights_data$Lags[(gestation+2):nrow(weights_data)], function(x) {(1-0.15)^(x)}) weights_data$Geom15 = c(rep(0,gestation+1), geo_temp/sum(geo_temp)) rm(geo_temp) # add bloom weights weights_data = weights_data %>% mutate(BloomExp = 1, BloomReal = 1, BloomLinear = 1, BloomFlat = 1) save(weights_data, file = "weights.RData") # construct knowledge stock # prepare the data frame for regression except for stock mydata = merge(MFP, WI, by = "YEAR") # normalize weather index mydata$INDEX = (mydata$INDEX - mean(mydata$INDEX))/sd(mydata$INDEX) # create TREND variable mydata = mydata %>% mutate(TREND = YEAR - mydata$YEAR[1] + 1) # gamma temp = apply(expand.grid(delta, lambda), 1, function(x) {res = stock_gamma(RD_INSTEPP %>% select(Year, `Public AgRD`), lag = lag_years, gestation = gestation, delta = x[1], lambda = x[2]) %>% filter(!is.na(Stock)) %>% select(Year, Stock) colnames(res) = c("Year", paste0("Delta=",x[1],", Lambda=", x[2])) return(res)}) temp = temp %>% reduce(full_join, by = "Year") mydata = merge(mydata, temp, by.x = "YEAR", by.y = "Year") # Trapezoid temp = RD_INSTEPP %>% select(Year, `Public AgRD`) %>% stock_trap(., lag = lag_years, gestation = gestation, increase = 9, constant = 15, decrease = 35) %>% filter(!is.na(Stock)) %>% select(Year, Stock) colnames(temp) = c("YEAR", "Trapezoid") mydata = merge(mydata, temp, by = "YEAR", all.x = T) # Geom10 and Geom15 temp = RD_INSTEPP %>% select(Year, `Public AgRD`) %>% stock_geom(., lag = lag_years, depreciation = 0.10, g_period = 10, gestation = gestation) %>% filter(!is.na(Stock)) %>% select(Year, Stock) colnames(temp) = c("YEAR", "Geom10") mydata = merge(mydata, temp, by = "YEAR", all.x = T) temp = RD_INSTEPP %>% select(Year, `Public AgRD`) %>% stock_geom(., lag = lag_years, depreciation = 0.15, g_period = 10, gestation = gestation) %>% filter(!is.na(Stock)) %>% select(Year, Stock) colnames(temp) = c("YEAR", "Geom15") mydata = merge(mydata, temp, by = "YEAR", all.x = T) # Bloom # temp = RD_INSTEPP %>% select(Year, `Public AgRD`) %>% # bloom.expost() %>% # filter(!is.na(Stock)) %>% select(Year, Stock) # colnames(temp) = c("YEAR", "BLOOMEXPO") # mydata = merge(mydata, temp, by = "YEAR", all.x = T) # # temp = RD_INSTEPP %>% select(Year, `Public AgRD`) %>% # bloom.real() %>% # filter(!is.na(Stock)) %>% select(Year, Stock) # colnames(temp) = c("YEAR", "BLOOMREAL") # mydata = merge(mydata, temp, by = "YEAR", all.x = T) temp = RD_INSTEPP %>% select(Year, `Public AgRD`) %>% bloom.linear() %>% filter(!is.na(Stock)) %>% select(Year, Stock) colnames(temp) = c("YEAR", "BLOOMLINEAR") mydata = merge(mydata, temp, by = "YEAR", all.x = T) # temp = RD_INSTEPP %>% select(Year, `Public AgRD`) %>% # bloom.flat() %>% # filter(!is.na(Stock)) %>% select(Year, Stock) # colnames(temp) = c("YEAR", "BLOOMFLAT") # mydata = merge(mydata, temp, by = "YEAR", all.x = T) rm(temp) save(mydata, file = paste0(out_dist))