setwd("/Users/shanchaowang/Desktop/Submission Data/") # set this to your local directory pacman::p_load(tidyverse, vroom, readxl) # Proxy of weather # Proxy = read_xlsx("Wang et al. _R&D lags_Regression-Data (2023-06-02).xlsx", sheet = '3. Yield Data') %>% mutate_at(c('YIELD', 'AREA', 'VOP', 'PRODUCTION'), as.numeric) # Corn need to exterpolate to 1940, has data after 1948 temp = Proxy %>% filter(COMMODITY == "CORN") %>% mutate plot(temp$YEAR, temp$VOP) # looks like exponential plot(temp$YEAR, log(temp$VOP)) # after transformation looks okay reg = lm(log(VOP) ~ YEAR + I(YEAR^2) + PRODUCTION, data = temp) # improved a little bit upon not includes PRODCUTION. Based on adjusted R^2 temp$PREDICTION = exp(predict(reg, temp)) Proxy[which(Proxy$COMMODITY == "CORN" & Proxy$YEAR < 1948), "VOP"] = temp$PREDICTION[which(temp$YEAR < 1948)] rm(temp, reg) # create balanced panel (can start from 1929, where SORGHUM has complete data) Proxy = Proxy %>% filter(YEAR %in% 1940:2010) agVOP = aggregate(VOP ~ YEAR, Proxy, sum) # create weight Proxy$WEIGHT = NA for (i in 1:nrow(Proxy)) { Proxy$WEIGHT[i] = Proxy$VOP[i]/agVOP$VOP[which(agVOP$YEAR == Proxy$YEAR[i])] } # double check temp = aggregate(WEIGHT ~ YEAR, Proxy, sum) sum(round(temp$WEIGHT) == 1) rm(temp) ########################### # run model for each crop # fitcrop = function(data){ temp = data %>% mutate(TREND = YEAR - min(data$YEAR) + 1) reg = lm(YIELD ~ TREND + I(TREND^2), data = temp) data$YIELD_EST = fitted(reg, temp) return(data) } fitcrop_demean = function(data){ data = data %>% mutate(TREND = YEAR - min(data$YEAR) + 1, YIELD = YIELD - mean(YIELD)) reg = lm(YIELD ~ TREND + I(TREND^2), data = data) data$YIELD_EST = fitted(reg, data) return(data) } fitcrop_std = function(data){ data = data %>% mutate(TREND = YEAR - min(data$YEAR) + 1, YIELD = (YIELD - mean(YIELD))/sd(YIELD)) reg = lm(YIELD ~ TREND + I(TREND^2), data = data) data$YIELD_EST = fitted(reg, data) return(data) } res = Proxy %>% group_by(COMMODITY) %>% do(fitcrop(.)) %>% as.data.frame() res_demean = Proxy %>% group_by(COMMODITY) %>% do(fitcrop_demean(.)) %>% as.data.frame() res_std = Proxy %>% group_by(COMMODITY) %>% do(fitcrop_std(.)) %>% as.data.frame() # plot fitted lines by crops res_long = gather(res %>% select(COMMODITY, YEAR, YIELD, YIELD_EST), VAR, VALUE, YIELD:YIELD_EST, factor_key = T) res_long = mutate(res_long, VAR = ifelse(VAR == "YIELD", "raw", "predicted")) ggplot(res_long, aes(x = YEAR, y = VALUE, color = VAR)) + geom_line() + facet_wrap(.~ COMMODITY, ncol = 2) + labs(colour = "Yield") + xlab("Year") + ylab("Yield") ggsave("../../plots/WeatherIndexByCrops.png", width = 20, height = 24, units = "cm") # plot fitted lines by crops (demean) res_long = gather(res_demean %>% select(COMMODITY, YEAR, YIELD, YIELD_EST), VAR, VALUE, YIELD:YIELD_EST, factor_key = T) res_long = mutate(res_long, VAR = ifelse(VAR == "YIELD", "raw", "predicted")) ggplot(res_long, aes(x = YEAR, y = VALUE, color = VAR)) + geom_line() + facet_wrap(.~ COMMODITY, ncol = 2) + labs(colour = "Yield") + xlab("Year") + ylab("Yield") ggsave("../../plots/WeatherIndexByCrops_demean.png", width = 20, height = 24, units = "cm") # plot fitted lines by crops (standardized) res_long = gather(res_std %>% select(COMMODITY, YEAR, YIELD, YIELD_EST), VAR, VALUE, YIELD:YIELD_EST, factor_key = T) res_long = mutate(res_long, VAR = ifelse(VAR == "YIELD", "raw", "predicted")) ggplot(res_long, aes(x = YEAR, y = VALUE, color = VAR)) + geom_line() + facet_wrap(.~ COMMODITY, ncol = 2) + labs(colour = "Yield") + xlab("Year") + ylab("Yield") ggsave("../../plots/WeatherIndexByCrops_standardized.png", width = 20, height = 24, units = "cm") #### aggregate raw and standardized results # # aggregate to get yield deviation # temp1 = res %>% mutate(INDEX = YIELD*WEIGHT) # temp1 = aggregate(INDEX ~ YEAR, temp1, sum) # temp1$VAR = "Data" # # temp2 = res %>% mutate(INDEX = YIELD_EST*WEIGHT) # temp2 = aggregate(INDEX ~ YEAR, temp2, sum) # temp2$VAR = "Predicted" # # temp = rbind(temp1, temp2) # ggplot(temp, aes(x = YEAR, y = INDEX, color = VAR)) + # geom_line() + # xlab("Year") + # ylab("Yield") + # labs(colour = "Yield") # ggsave("../../plots/WeatherIndex.png", width = 24, height = 20, units = "cm") # # # create INDEX # INDEX = data.frame(YEAR = temp1$YEAR, YIELD = temp1$INDEX, YIELD_EST = temp2$INDEX) # INDEX = INDEX %>% mutate(INDEX = YIELD - YIELD_EST) # write.csv(INDEX, "WI.csv", row.names = F) # rm(temp, temp1, temp2) #### standardized data # aggregate to get yield deviation temp1 = res_std %>% mutate(INDEX = YIELD*WEIGHT) temp1 = aggregate(INDEX ~ YEAR, temp1, sum) temp1$VAR = "Data" temp2 = res_std %>% mutate(INDEX = YIELD_EST*WEIGHT) temp2 = aggregate(INDEX ~ YEAR, temp2, sum) temp2$VAR = "Predicted" temp = rbind(temp1, temp2) ggplot(temp, aes(x = YEAR, y = INDEX, color = VAR)) + geom_line() + xlab("Year") + ylab("Yield") + labs(colour = "Yield") ggsave("../../plots/WeatherIndex_std.png", width = 24, height = 20, units = "cm") write.csv(temp, file = "v2/Figure3.csv") # create INDEX INDEX = data.frame(YEAR = temp1$YEAR, YIELD = temp1$INDEX, YIELD_EST = temp2$INDEX) INDEX = INDEX %>% mutate(INDEX = YIELD - YIELD_EST) write.csv(INDEX, "WI_std.csv", row.names = F) rm(temp, temp1, temp2)