source("1_fetch/src/get_nwis_sites.R")
source("1_fetch/src/get_daily_nwis_data.R")
source("1_fetch/src/get_inst_nwis_data.R")
source("1_fetch/src/find_sites_multipleTS.R")
source('1_fetch/src/get_nlcd_LC.R')
source("1_fetch/src/get_nhdplusv2.R")
source('1_fetch/src/download_tifs_annual.R')
source("1_fetch/src/get_gf.R")
source("1_fetch/src/fetch_sb_data.R")
source("1_fetch/src/fetch_nhdv2_attributes_from_sb.R")
source("1_fetch/src/download_file.R")
source("1_fetch/src/munge_reach_attr_tbl.R")
source("1_fetch/src/generate_credentials.R")
source("1_fetch/src/check_physio_region.R")
source('2_process/src/write_data.R')
# tar_cue for downloading NWIS sites and data.
# change to 'thorough' to download, and 'never' to prevent downloading.
NWIS_cue = 'never'
NWIS_repository = 'aws'
# Change dummy date to document when NWIS SC sites and data were downloaded
dummy_date <- "2022-06-16"
# Note about 'local' targets:
# The 'local' targets in this file are marked as such because the respective
# function does not return a single path, so targets cannot upload the output
# to S3 and instead throws an error.
# See https://github.com/USGS-R/drb-inland-salinity-ml/issues/152
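# (With repository = 'local', targets tracks the output in the local _targets/
# data store rather than uploading it to the S3 bucket.)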
p1_targets_list <- list(
# AWS credentials target
tar_target(
p1_aws_credentials_1,
generate_credentials(),
deployment = 'main',
cue = tar_cue('always'),
priority = 0.99 # default priority (0.8) is set globally in _targets.R
),
# dummy target with high priority. This is here so that no consequential
# targets are rebuilt if we forget to renew AWS credentials.
tar_target(
p1_dummy,
{},
deployment = 'main',
cue = tar_cue('always'),
priority = 0.98 # default priority (0.8) is set globally in _targets.R
),
# Load harmonized WQP data product for discrete samples. Note that this data
# frame was created from an (internal) DRB data harmonization GitLab repository:
# https://code.usgs.gov/wma/iws/wq-data-harmonization/-/commit/b705a2b91877d5e0bb9386250659b46149295b55
# The output rds file was downloaded from the inland salinity sharepoint folder
# and manually placed in 1_fetch/in.
tar_target(
p1_wqp_data_rds,
"1_fetch/in/20220731_DRB_WQdata.rds",
format = 'file',
repository = 'local',
deployment = 'main'
),
tar_target(
p1_wqp_data,
readRDS(file = p1_wqp_data_rds),
deployment = 'main'
),
# Identify NWIS sites with SC data
tar_target(
p1_nwis_sites,
{
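# Referencing dummy_date here makes this target depend on it, so bumping the
# date string above invalidates the download when NWIS_cue = 'thorough'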
dummy <- dummy_date
get_nwis_sites(drb_huc8s, pcodes_select, site_tp_select, stat_cd_select)
},
deployment = 'main',
cue = tar_cue(mode = NWIS_cue),
repository = NWIS_repository
),
# Subset daily NWIS sites
tar_target(
p1_nwis_sites_daily,
p1_nwis_sites %>%
# retain "dv" sites that contain data records after user-specified {earliest_date}
filter(data_type_cd=="dv",!(site_no %in% omit_nwis_sites),
end_date > earliest_date, begin_date < latest_date) %>%
# for sites with multiple time series (ts_id), retain the most recent time series for site_info
group_by(site_no) %>% arrange(desc(end_date)) %>% slice(1),
deployment = 'main',
cue = tar_cue(mode = NWIS_cue),
repository = NWIS_repository
),
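# A toy illustration (comment only) of the keep-most-recent idiom used above:
#   tibble(site_no = c('A', 'A', 'B'),
#          end_date = as.Date(c('2020-01-01', '2021-01-01', '2019-06-01'))) %>%
#     group_by(site_no) %>% arrange(desc(end_date)) %>% slice(1)
#   # -> one row per site_no, keeping the row with the most recent end_date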
# Download NWIS daily data
tar_target(
p1_daily_data,
get_daily_nwis_data(p1_nwis_sites_daily, parameter, stat_cd_select,
start_date = earliest_date, end_date = latest_date),
pattern = map(p1_nwis_sites_daily),
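# one branch per site (row) of p1_nwis_sites_daily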
deployment = 'main',
cue = tar_cue(mode = NWIS_cue),
repository = NWIS_repository
),
# Subset NWIS sites with instantaneous (sub-daily) data
tar_target(
p1_nwis_sites_inst,
p1_nwis_sites %>%
# retain "uv" sites that contain data records after user-specified {earliest_date}
filter(data_type_cd=="uv",!(site_no %in% omit_nwis_sites),
end_date > earliest_date, begin_date < latest_date) %>%
# for sites with multiple time series (ts_id), retain the most recent time series for site_info
group_by(site_no) %>% arrange(desc(end_date)) %>% slice(1),
deployment = 'main',
cue = tar_cue(mode = NWIS_cue),
repository = NWIS_repository
),
# Create log file to track sites with multiple time series
tar_target(
p1_nwis_sites_inst_multipleTS_csv,
find_sites_multipleTS(p1_nwis_sites, earliest_date, latest_date, omit_nwis_sites,
"3_visualize/log/summary_multiple_inst_ts.csv"),
format = "file",
deployment = 'main'
),
# Download NWIS instantaneous data
tar_target(
p1_inst_data,
get_inst_nwis_data(p1_nwis_sites_inst, parameter,
start_date = earliest_date, end_date = latest_date),
pattern = map(p1_nwis_sites_inst),
deployment = 'main',
cue = tar_cue(mode = NWIS_cue),
repository = NWIS_repository
),
tar_target(
p1_reaches_shp_zip,
# [Jeff] I downloaded this manually from ScienceBase:
# https://www.sciencebase.gov/catalog/item/5f6a285d82ce38aaa244912e
# Because it's a shapefile, it's not easily downloaded using sbtools
# like other files are (see https://github.com/USGS-R/sbtools/issues/277).
# Because of that, and since it's small (<700 KB), I figured it'd be fine to
# just include it in the repo and have it loosely referenced to the sb item ^
"1_fetch/in/study_stream_reaches.zip",
format = "file",
repository = 'local',
deployment = 'main'
),
# Unzip zipped shapefile
tar_target(
p1_reaches_shp,
{shapedir = "1_fetch/out/study_stream_reaches"
# `shp_files` is a vector of all files ('dbf', 'prj', 'shp', 'shx')
shp_files <- unzip(p1_reaches_shp_zip, exdir = shapedir)
# return just the .shp file
grep(".shp", shp_files, value = TRUE)},
format = "file",
deployment = 'main'
),
# read shapefile into sf object
tar_target(
p1_reaches_sf,
st_read(p1_reaches_shp, quiet = TRUE),
deployment = 'main'
),
# Download NHDPlusV2 flowlines for DRB
tar_target(
p1_nhdv2reaches_sf,
get_nhdv2_flowlines(drb_huc8s),
deployment = 'main'
),
# Download edited HRU polygons from https://github.com/USGS-R/drb-network-prep
tar_target(
p1_catchments_edited_gpkg,
download_file(GFv1_HRUs_edited_url,
fileout = "1_fetch/out/GFv1_catchments_edited.gpkg",
mode = "wb", quiet = TRUE),
format = "file",
deployment = 'main'
),
# Read in edited HRU polygons
tar_target(
p1_catchments_edited_sf,
sf::st_read(dsn = p1_catchments_edited_gpkg, layer = "GFv1_catchments_edited", quiet = TRUE) %>%
mutate(PRMS_segid_split_col = PRMS_segid) %>%
separate(col = PRMS_segid_split_col, sep = '_', into = c('prms_subseg_seg', 'PRMS_segment_suffix')) %>%
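# separate() reduces e.g. PRMS_segid '3_1' to prms_subseg_seg = '3'; the
# case_when() below restores the full id for the PRMS segments (3, 8, 51)
# that exist as split sub-segments in the edited network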
mutate(hru_area_km2 = hru_area_m2/10^6,
prms_subseg_seg = case_when(PRMS_segid == '3_1' ~ '3_1',
PRMS_segid == '3_2' ~ '3_2',
PRMS_segid == '8_1' ~ '8_1',
PRMS_segid == '8_2' ~ '8_2',
PRMS_segid == '51_1' ~ '51_1',
PRMS_segid == '51_2' ~ '51_2',
TRUE ~ prms_subseg_seg)) %>%
select(-hru_area_m2, -PRMS_segment_suffix),
deployment = 'main'
),
# Download DRB network attributes
# Retrieved from: https://www.sciencebase.gov/catalog/item/5f6a289982ce38aaa2449135
# see note at top of file about 'local' targets
tar_target(
p1_prms_reach_attr_csvs,
download_sb_file(sb_id = "5f6a289982ce38aaa2449135",
file_name = c("reach_attributes_drb.csv",
"distance_matrix_drb.csv",
"sntemp_inputs_outputs_drb.zip"),
out_dir="1_fetch/out"),
format="file",
deployment = 'main',
repository = 'local'
),
# Read DRB reach attributes, with the _1 segment attributes used for the corresponding _2 reaches
tar_target(
p1_prms_reach_attr,
munge_reach_attr_table(p1_prms_reach_attr_csvs)
),
# Read DRB network adjacency matrix
tar_target(
p1_ntw_adj_matrix,
read_csv(grep("distance_matrix",p1_prms_reach_attr_csvs,value=TRUE),
show_col_types = FALSE)
),
# Unzip DRB SNTemp Inputs-Outputs from temperature project
tar_target(
p1_sntemp_inputs_outputs_csv,
unzip(zipfile = grep("sntemp_inputs_outputs",p1_prms_reach_attr_csvs,value=TRUE),
exdir = "1_fetch/out", overwrite = TRUE),
format = "file"
),
# Read DRB SNTemp Inputs-Outputs from temperature project
tar_target(
p1_sntemp_inputs_outputs,
read_csv(p1_sntemp_inputs_outputs_csv, show_col_types = FALSE)
),
# Read in all NLCD data from 2001-2019
# Note: NLCD data must already be downloaded locally and manually placed in NLCD_LC_path ('1_fetch/in/NLCD_final/')
tar_target(
p1_NLCD_LC_data,
read_subset_LC_data(LC_data_folder_path = NLCD_LC_path,
Comids_in_AOI_df = p1_nhdv2reaches_sf %>% st_drop_geometry() %>% select(COMID),
Comid_col = 'COMID', NLCD_type = NULL)
),
# Download and unzip other NLCD 2011 datasets
## Note - this returns a string, or vector of strings, giving the path(s) to the unzipped datasets
# see note at top of file about 'local' targets
tar_target(
p1_NLCD2011_data_unzipped,
{zip_files <- download_NHD_data(sb_id = sb_ids_NLCD2011,
out_path = '1_fetch/out',
downloaded_data_folder_name = NLCD2011_folders,
output_data_parent_folder = 'NLCD_LC_2011_Data')
unzipped_files <- unzip_NHD_data(downloaded_data_folder_path = zip_files,
create_unzip_subfolder = TRUE)
rm(zip_files)
unzipped_files
},
format = 'file',
deployment = 'main',
repository = 'local'
),
# Read in NLCD datasets and subset by comid in DRB
## Note that this returns a list of data frames if more than one NLCD dataset is in
## p1_NLCD2011_data_unzipped
tar_target(
p1_NLCD2011_data,
read_subset_LC_data(LC_data_folder_path = p1_NLCD2011_data_unzipped,
Comids_in_AOI_df = p1_nhdv2reaches_sf %>% st_drop_geometry() %>% select(COMID),
Comid_col = 'COMID')
),
# Download FORE-SCE backcasted LC tif files and subset to years we want
## Retrieved from: https://www.sciencebase.gov/catalog/item/605c987fd34ec5fa65eb6a74
## Note - only file #1 DRB_Historical_Reconstruction_1680-2010.zip will be extracted
# see note at top of file about 'local' targets
tar_target(
p1_FORESCE_backcasted_LC,
download_tifs(sb_id = '605c987fd34ec5fa65eb6a74',
filename = 'DRB_Historical_Reconstruction_1680-2010.zip',
download_path = '1_fetch/out',
## Subset downloaded tifs to only process the years that are relevant to the model
year = FORESCE_years,
name_unzip_folder = NULL,
overwrite_file = TRUE,
name = FORESCE_years),
format = 'file',
deployment = 'main',
repository = 'local'
),
# Targets for the land cover reclassification .csv files
tar_target(
p1_NLCD_reclass_table_csv,
'1_fetch/in/Legend_NLCD_Land_Cover.csv',
format = 'file',
repository = 'local',
deployment = 'main'
),
tar_target(
p1_FORESCE_reclass_table_csv,
'1_fetch/in/Legend_FORESCE_Land_Cover.csv',
format = 'file',
repository = 'local',
deployment = 'main'
),
tar_target(
p1_NLCD_reclass_table,
read_csv(p1_NLCD_reclass_table_csv, show_col_types = FALSE),
deployment = 'main'
),
tar_target(
p1_FORESCE_reclass_table,
read_csv(p1_FORESCE_reclass_table_csv, show_col_types = FALSE),
deployment = 'main'
),
# Download road salt accumulation data for the DRB
## Retrieved from: https://www.sciencebase.gov/catalog/item/5b15a50ce4b092d9651e22b9
## Note - only zip file named 1992_2015.zip will be extracted
# see note at top of file about 'local' targets
tar_target(
p1_rdsalt,
download_tifs(sb_id = '5b15a50ce4b092d9651e22b9',
filename = '1992_2015.zip',
download_path = '1_fetch/out',
overwrite_file = TRUE,
## no year subsetting, as all years with road salt data are relevant here
year = NULL,
name_unzip_folder = 'rd_salt'),
format = 'file',
deployment = 'main',
repository = 'local'
),
# CSV of variables from the Wieczorek dataset that are of interest
tar_target(
p1_vars_of_interest_csv,
'1_fetch/in/NHDVarsOfInterest.csv',
format = 'file',
repository = 'local',
deployment = 'main'
),
# Variables from the Wieczorek dataset that are of interest
# use tar_group to define row groups based on ScienceBase ID; row groups facilitate
# branching over subsets of the VarsOfInterest table in downstream targets
tar_target(
p1_vars_of_interest,
read_csv(p1_vars_of_interest_csv, show_col_types = FALSE) %>%
# Parse sb_id from sb link
mutate(sb_id = str_extract(Science.Base.Link,"[^/]*$")) %>%
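# ("[^/]*$" keeps everything after the final '/', i.e. the trailing
#  ScienceBase item id of the link)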
# Omit LandCover rows since we are loading those separately
filter(!Theme %in% c('Land Cover')) %>%
group_by(sb_id) %>%
tar_group(),
iteration = "group",
deployment = 'main'
),
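# For reference, a minimal sketch (comment only) of the grouped-branching
# idiom used above; `df`, `g`, and `do_something` are placeholders. Each
# branch of the downstream target receives one row group:
#   tar_target(grouped, df %>% group_by(g) %>% tar_group(), iteration = "group")
#   tar_target(per_group, do_something(grouped), pattern = map(grouped))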
# Map over variables of interest to download NHDv2 attribute data from ScienceBase
tar_target(
p1_vars_of_interest_downloaded_csvs,
fetch_nhdv2_attributes_from_sb(vars_item = p1_vars_of_interest, save_dir = "1_fetch/out",
comids = p1_nhdv2reaches_sf$COMID,
delete_local_copies = TRUE),
pattern = map(p1_vars_of_interest),
format = "file",
deployment = 'main'
),
# Download and unzip monthly natural baseflow for the DRB
# from Miller et al. 2021: https://www.sciencebase.gov/catalog/item/6023e628d34e31ed20c874e4
# see note at top of file about 'local' targets
tar_target(
p1_natural_baseflow_csv,
{zip_file <- download_sb_file(sb_id = "6023e628d34e31ed20c874e4",
file_name = "baseflow_partial_model_pred_XX.zip",
out_dir="1_fetch/out")
unzip(zipfile = zip_file,
exdir = dirname(zip_file), overwrite = TRUE)
unzipped_files <- file.path(dirname(zip_file),
list.files(path = dirname(zip_file),
pattern = "*baseflow.*.csv"))
unlink(zip_file, recursive = FALSE)
rm(zip_file)
unzipped_files
},
format = "file",
repository = 'local',
deployment = 'main'
),
# CSV file of gridmet drivers aggregated to PRMS segments
tar_target(
p1_gridmet_csv,
"1_fetch/in/drb_climate_2022_04_06_segments.csv",
format = "file",
repository = "local"
),
# Read gridmet csv into tibble
tar_target(
p1_gridmet,
read_csv(p1_gridmet_csv, show_col_types = FALSE)
),
# Reservoir locations (rds, not a shapefile) for plotting on maps:
# a list of dams (points) and reservoirs (polygons).
# dams includes the reservoir outlet points.
tar_target(
p1_reservoirs_rds,
"1_fetch/in/filtered_dams_reservoirs.rds",
format = "file",
repository = "local"
),
tar_target(
p1_reservoirs_sf,
readRDS(p1_reservoirs_rds)
),
# Read in .rds file containing the DRB adjacency matrix, referenced to
# subseg_id (PRMS_segid). subseg_distance_matrix.rds was created as part
# of the delaware-model-prep pipeline and was downloaded on 11/3/2022.
# https://github.com/USGS-R/delaware-model-prep/blob/main/1_network.yml#L69
tar_target(
p1_drb_distance_matrix_rds,
"1_fetch/in/subseg_distance_matrix.rds",
format = "file",
repository = "local"
),
tar_target(
p1_drb_distance_matrix,
readRDS(p1_drb_distance_matrix_rds)
),
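# Extract the 'updown' element; per the delaware-model-prep naming this is
# (presumably) the matrix of combined upstream + downstream distances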
tar_target(
p1_drb_distance_matrix_updown,
p1_drb_distance_matrix$updown
),
# Save as csv for readability
tar_target(
p1_drb_distance_matrix_updown_csv,
{
fileout <- "1_fetch/out/drb_distance_matrix_updown.csv"
write.csv(x = as.data.frame(p1_drb_distance_matrix$updown),
file = fileout,
row.names = TRUE)
fileout
},
format = 'file',
repository = 'local'
),
# Save as npz for use in Python
tar_target(
p1_drb_distance_matrix_npz,
write_dist_matrix_npz(p1_drb_distance_matrix,
"1_fetch/out/drb_distance_matrix.npz"),
format = 'file',
repository = 'local'
),
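# (The .npz file can be read in Python with numpy.load(); the array keys
# depend on what write_dist_matrix_npz stores.)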
# Download DRB physiographic ecoregions
# Retrieved from: https://www.sciencebase.gov/catalog/item/5d94949de4b0c4f70d0db64f
# see note at top of file about 'local' targets
tar_target(
p1_drb_ecoreg_zip,
download_sb_file(sb_id = "5d94949de4b0c4f70d0db64f",
file_name = "physiographic_regions_DRB.zip",
out_dir="1_fetch/out"),
format="file",
deployment = 'main',
repository = 'local'
),
# Unzip DRB ecoregion / physiographic region shapefile
tar_target(
p1_drb_ecoreg_shp,
unzip(zipfile = p1_drb_ecoreg_zip, exdir = "1_fetch/out", overwrite = TRUE),
format = "file",
deployment = 'main',
repository = "local"
),
# Convert to sf object
tar_target(
p1_drb_ecoreg_sf,
st_read(grep(pattern = 'shp$', x = p1_drb_ecoreg_shp, value = TRUE)) %>%
#4326 matches p1_reaches_sf
st_transform(crs = 4326),
deployment = 'main'
),
# Add physiographic regions to the PRMS reach sf object
tar_target(
p1_reaches_ecoreg_sf,
get_physio_regions(reaches = p1_reaches_sf, ecoregions = p1_drb_ecoreg_sf),
deployment = 'main'
)
)