library(RCurl) library(stringr) library(tidyverse) library(data.table) mydata = fread("/Users/shanchaowang/Desktop/qs.crops_20230701.txt", stringsAsFactors = FALSE) # Change this to your local directory where you download USDA data mydata = mydata %>% filter(AGG_LEVEL_DESC == "NATIONAL", FREQ_DESC == "ANNUAL", GROUP_DESC == "FIELD CROPS", SOURCE_DESC == "SURVEY", YEAR >= 1866, YEAR <= 2020) ################## # AREA HARVESTED # # Do this first to find out top ten crops based on acreage AREA = mydata %>% filter(STATISTICCAT_DESC == "AREA HARVESTED", REFERENCE_PERIOD_DESC == "YEAR") %>% select(COMMODITY_DESC, CLASS_DESC, SHORT_DESC, YEAR, UNIT_DESC, VALUE) %>% arrange(COMMODITY_DESC, CLASS_DESC, SHORT_DESC, YEAR) AREA$VALUE = as.numeric(gsub(",","",AREA$VALUE)) colnames(AREA)[ncol(AREA)] = "AREA" # do not need to convert, all in ACRE unique(AREA$UNIT_DESC) # compare mean area harvested temp = aggregate(AREA ~ COMMODITY_DESC + SHORT_DESC, data = AREA, mean, na.rm = T) %>% arrange(desc(AREA)) # calculate % of total area of top 10 crops sum(temp$AREA[1:10])/sum(temp$AREA) # top 10 filed crops # CORN, HAY, WHEAT, SOYBEANS, OATS, COTTON, SORGHUM, BARLEY, SUNFLOWER (data on AREA from 1975, so skip), RICE, FLAXSEED # use their short description to distinguish varieties of a same crop top10 = c("CORN, GRAIN -", "HAY -", "WHEAT -", "SOYBEANS -", "OATS -", "COTTON -", "SORGHUM, GRAIN -", "BARLEY -", "RICE -", "FLAXSEED -") AREA = AREA %>% filter(grepl(paste(top10, collapse = "|"), SHORT_DESC)) %>% select(COMMODITY_DESC, YEAR, AREA) colnames(AREA) = c("COMMODITY", "YEAR", "AREA") ######### # YIELD # YIELD = mydata %>% filter(STATISTICCAT_DESC == "YIELD", REFERENCE_PERIOD_DESC == "YEAR", grepl(paste(top10, collapse = "|"), SHORT_DESC)) %>% select(COMMODITY_DESC, CLASS_DESC, SHORT_DESC, YEAR, UNIT_DESC, VALUE) %>% arrange(COMMODITY_DESC, CLASS_DESC, SHORT_DESC, YEAR) YIELD$VALUE = as.numeric(gsub(",","",YIELD$VALUE)) colnames(YIELD)[ncol(YIELD)] = "YIELD" # convert all yield to lb/acre, based on USDA Grain Inspection Handbook, Book II unique(YIELD$UNIT_DESC) YIELD = YIELD %>% mutate(YIELD = ifelse(COMMODITY_DESC == "BARLEY", YIELD*48, ifelse(COMMODITY_DESC %in% c("CORN", "FLAXSEED", "SORGHUM"), YIELD*56, ifelse(COMMODITY_DESC %in% c("SOYBEANS", "WHEAT"), YIELD*60, ifelse(COMMODITY_DESC == "HAY", YIELD*2204.62, ifelse(COMMODITY_DESC == "OATS", YIELD*32, YIELD)))))) YIELD = YIELD %>% select(COMMODITY_DESC, YEAR, YIELD) colnames(YIELD)[1] = "COMMODITY" ################### # PRODUCTION in $ # VOP = mydata %>% filter(grepl("PRODUCTION, MEASURED IN \\$", SHORT_DESC), REFERENCE_PERIOD_DESC == "YEAR", grepl(paste(top10, collapse = "|"), SHORT_DESC)) %>% select(COMMODITY_DESC, CLASS_DESC, SHORT_DESC, YEAR, UNIT_DESC, VALUE) %>% arrange(COMMODITY_DESC, CLASS_DESC, SHORT_DESC, YEAR) VOP$VALUE = as.numeric(gsub(",","",VOP$VALUE)) VOP = VOP %>% select(COMMODITY_DESC, YEAR, VALUE) colnames(VOP) = c("COMMODITY", "YEAR", "VOP") ################### # merge data and save res = Reduce(function(...) merge(..., all = T), list(YIELD, AREA, VOP)) res = res %>% mutate(PRODUCTION = YIELD*AREA) write.csv(res, "USDACropData.csv", row.names = F) # save as a local data file. Added in Wang et al. _R&D lags_Regression-Data (2023-06-02) in the yield tab.