Coverage for src/efts_io/variables.py: 19.74%

62 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2025-07-24 10:14 +1000

1"""Handling of EFTS netCDF variables definitions.""" 

2 

3from typing import Any, Dict, Optional, Tuple 

4 

5# import netCDF4 

6import numpy as np 

7import pandas as pd 

8import xarray as xr 

9 

10from efts_io._internals import create_data_variable 

11from efts_io.attributes import create_var_attribute_definition 

12from efts_io.conventions import ( 

13 AREA_VARNAME, 

14 ENS_MEMBER_DIMNAME, 

15 FILLVALUE_ATTR_KEY, 

16 LEAD_TIME_DIMNAME, 

17 STANDARD_NAME_ATTR_KEY, 

18 STATION_DIMNAME, 

19 UNITS_ATTR_KEY, 

20) 

21from efts_io.dimensions import _create_nc_dims 

22 

23 

24#' Create a variable definition 

25#' 

26#' Create a variable definition usable by the function \code{\link{create_efts_variables}} to create netCDF variables. 

27#' 

28#' @param name variable name 

29#' @param longname variable long name 

30#' @param units variable units 

31#' @param missval value code for missing data 

32#' @param precision precision 

33#' @param dim_type dimension type (EFTS integer code) 

34#' @param var_attribute list of attributes for the netCDF variable to create 

35#' @export 

36#' @return a list 

37#' @examples 

38#' var_def = create_variable_definition(name='rain_der', 

39#' longname='Rainfall ensemble forecast derived from some prediction', units='mm', 

40#' missval=-9999.0, precision='double', var_attribute=list(type=2L, 

41#' description="accumulated over the preceding interval", 

42#' dat_type = "der", dat_type_description="AWAP data interpolated from observations", 

43#' location_type = "Point")) 

def create_variable_definition(
    name: str,
    longname: str = "",
    units: str = "mm",
    missval: float = -9999.0,
    precision: str = "double",
    dim_type: str = "4",
    var_attribute: Optional[dict[str, str]] = None,
) -> dict[str, Any]:
    """Build the dictionary describing a single data variable.

    Args:
        name: variable name.
        longname: variable long name.
        units: variable units.
        missval: value code for missing data.
        precision: precision of the variable.
        dim_type: dimension type code; stored as given.
        var_attribute: attributes for the netCDF variable to create. When
            ``None``, the result of ``create_var_attribute_definition()`` is
            used instead.

    Returns:
        A dictionary with the entries ``name``, ``longname``, the units
        attribute key, ``dim_type``, ``missval``, ``precision`` and
        ``attributes``, in that order.
    """
    attributes = create_var_attribute_definition() if var_attribute is None else var_attribute

    # Entries are added one by one, in the order the definition exposes them.
    definition: dict[str, Any] = {}
    definition["name"] = name
    definition["longname"] = longname
    definition[UNITS_ATTR_KEY] = units
    definition["dim_type"] = dim_type
    definition["missval"] = missval
    definition["precision"] = precision
    definition["attributes"] = attributes
    return definition

65 

66 

67# #' Create a variables definition data frame 

68# #' 

69# #' Create a variable definition usable by the function \code{\link{create_variable_definitions}} 

70# #' to create netCDF variables. The use of this function is not compulsory to create a EFTS 

71# #' netCDF schema, just offered as a convenience. 

72# #' 

73# #' @param variable_names character vector, names of the variables 

74# #' @param long_names character vector, long names of the variables (defaults to variable_names if missing) 

75# #' @param standard_names character vector, standard names of the variables (optional, defaults to variable_names) 

76# #' @param units character vector, units for the variable(s) 

77# #' @param missval numeric vector, missing value code(s) for the variable(s) 

78# #' @param precision character vector, precision of the variables 

79# #' @param dimensions character or integer vector, number of dimensions each variable (2, 3 or 4) 

80# #' @param var_attributes a list of named attributes. See \code{\link{create_var_attribute_definition}} 

81# #' @export 

82# #' @return a data frame suitable for \code{\link{create_variable_definition}} 

83# #' @seealso See 

84# #' \code{\link{create_variable_definition}} and \code{\link{create_efts}} for examples 

85# create_variable_definition_dataframe(variable_names, long_names = variable_names, standard_names = variable_names, units = "mm", missval = -9999.0, 

86# precision = "double", dimensions = 4L, var_attributes = create_var_attribute_definition()) { 

87# stopifnot(is.character(variable_names)) 

88# varsDef = data.frame(name = variable_names, stringsAsFactors = FALSE) 

89# varsDef$longname = long_names 

90# varsDef$standard_name = standard_names 

91# varsDef$units = units 

92# varsDef$missval = missval 

93# varsDef$precision = precision 

94# varsDef$dimensions = as.integer(dimensions) 

95 

96# va = data.frame(var_attributes, stringsAsFactors = FALSE) 

97# if(nrow(va) < nrow(varsDef)) { 

98# va = va[ rep(1:nrow(va), length.out=nrow(varsDef)), ] 

99# } 

100 

101# varsDef = cbind(varsDef, va) 

102# rownames(varsDef) = varsDef$name 

103# return(varsDef) 

104# } 

105 

106 

107#' Provide a template definition of optional geolocation variables 

108#' 

109#' Provide a template definition of optional geolocation and geographic variables x, y, area and elevation. 

110#' See \url{https://github.com/jmp75/efts/blob/107c553045a37e6ef36b2eababf6a299e7883d50/docs/netcdf_for_water_forecasting.md#optional-variables}. 

111#' 

112#' @export 

113#' @return a data frame 

114#' @seealso See 

115#' \code{\link{create_variable_definition}} and \code{\link{create_efts}} for examples 

116#' @export 

def default_optional_variable_definitions_v2_0() -> pd.DataFrame:
    """Provide a template definition of optional geolocation variables.

    The template covers the optional variables ``x``, ``y``, the area variable
    and ``elevation``.

    Returns:
        A data frame with one row per variable and the columns ``name``,
        ``longname``, the standard name attribute key, the units attribute
        key, ``missval`` and ``precision``.
    """
    return pd.DataFrame.from_dict(
        {
            "name": ["x", "y", AREA_VARNAME, "elevation"],
            "longname": [
                "easting from the GDA94 datum in MGA Zone 55",
                "northing from the GDA94 datum in MGA Zone 55",
                "catchment area",
                "station elevation above sea level",
            ],
            # Row order must follow "name": x is the easting, y is the northing.
            STANDARD_NAME_ATTR_KEY: [
                "easting_GDA94_zone55",
                "northing_GDA94_zone55",
                AREA_VARNAME,
                "elevation",
            ],
            UNITS_ATTR_KEY: ["", "", "km^2", "m"],
            "missval": [np.nan, np.nan, -9999.0, -9999.0],
            "precision": np.repeat("float", 4),
        },
    )

139 

140 

141# ######################################## 

142# # Below are functions not exported 

143# ######################################## 

144 

145 

146#' Create variable definitions from a data frame 

147#' 

148#' Given a data frame as input, create a list of variable definitions usable by the function \code{\link{create_efts_variables}} to create netCDF variables. 

149#' 

150#' @param dframe a data frame, one line is one variable definition. Must have at least the following column names: 'name', 'longname', 'units', 'missval', 'precision', 'dimensions'. Any other column (e.g. 'type', 'type_description', 'location_type') is passed on as a variable attribute.

151#' @export 

152#' @return a list of length equal to the number of rows in the input data frame 

153#' @seealso See 

154#' \code{\link{create_efts}} for examples 

155#' @examples 

156#' varsDef = data.frame(name=letters[1:3], stringsAsFactors=FALSE) 

157#' varsDef$longname=paste('long name for', varsDef$name) 

158#' varsDef$units='mm' 

159#' varsDef$missval=-999.0 

160#' varsDef$precision='double' 

161#' varsDef$type=2 

162#' varsDef$type_description='accumulated over the previous time step' 

163#' varsDef$location_type='Point' 

164#' str(create_variable_definitions(varsDef)) 

165#' 

def create_variable_definitions(dframe: pd.DataFrame) -> Dict[str, Any]:
    """Create variable definitions from a data frame.

    Args:
        dframe: a data frame where one row is one variable definition. It must
            have the columns ``name``, ``longname``, the units attribute key,
            ``missval``, ``precision`` and ``dimensions``. Every other column
            is passed on as an entry of the variable's attributes.

    Returns:
        A dictionary mapping each variable name to the definition produced by
        ``create_variable_definition`` for that row. An empty data frame gives
        an empty dictionary.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    non_opt_attr = ["name", "longname", UNITS_ATTR_KEY, "missval", "precision", "dimensions"]
    varargs_attr = [col for col in dframe.columns if col not in non_opt_attr]

    definitions: Dict[str, Any] = {}
    # Explicit row iteration (rather than DataFrame.apply(...).to_dict()) copes
    # with empty frames and with frames whose index labels are not unique.
    for _, row in dframe.iterrows():
        definition = create_variable_definition(
            name=row["name"],
            longname=row["longname"],
            units=row[UNITS_ATTR_KEY],
            missval=row["missval"],
            precision=row["precision"],
            # The dimension code may be stored as an integer in the data frame,
            # but it is compared against the strings "2", "3" and "4" later on.
            dim_type=str(row["dimensions"]),
            var_attribute={col: row[col] for col in varargs_attr},
        )
        definitions[definition["name"]] = definition
    return definitions

193 

194 

def create_mandatory_vardefs(
    station_dim: Tuple[str, np.ndarray, Dict[str, str]],
    str_dim: Tuple[str, np.ndarray, Dict[str, str]],
    ensemble_dim: Tuple[str, np.ndarray, Dict[str, str]],
    lead_time_dim: Tuple[str, np.ndarray, Dict[str, str]],
    lead_time_tstep: str = "hours",
) -> Dict[str, xr.Variable]:
    """Create mandatory variable definitions.

    Each dimension argument is a tuple; this function reads element ``[0]`` as
    the dimension name, ``[1]`` as the dimension values and ``[2]`` as a
    dictionary holding at least a ``"longname"`` entry.

    Args:
        station_dim: station dimension; its values become the station
            identifiers and size the station name, latitude and longitude
            variables.
        str_dim: string-length dimension; its name and length are used for the
            station name variable.
        ensemble_dim: ensemble member dimension.
        lead_time_dim: lead time dimension.
        lead_time_tstep: time step of the lead time, used to build the lead
            time units as ``"<lead_time_tstep> since time"``.

    Returns:
        A dictionary with the keys ``station_ids_var``, ``station_names_var``,
        ``ensemble_var``, ``lead_time_var``, ``latitude_var`` and
        ``longitude_var``. Station names start as empty byte strings, and
        latitude and longitude start filled with the missing value -9999.0.
    """
    # https://github.com/jmp75/efts/blob/107c553045a37e6ef36b2eababf6a299e7883d50/docs/netcdf_for_water_forecasting.md#mandatory-variables
    # float time(time)
    # int station_id(station)
    # char station_name(strLen, station)
    # int ens_member(ens_member)
    # float lead_time(lead_time)
    # float lat (station)
    # float lon (station)

    def _attrs(longname: str, units: str, missval: Any, precision: str) -> Dict[str, Any]:
        """Assemble the attribute dictionary shared by all mandatory variables."""
        return {
            "longname": longname,
            UNITS_ATTR_KEY: units,
            "missval": missval,
            "precision": precision,
        }

    missing_coordinate = -9999.0
    station_values = station_dim[1]

    station_id_variable = xr.Variable(
        dims=[STATION_DIMNAME],
        data=station_values,
        encoding={FILLVALUE_ATTR_KEY: None},
        attrs=_attrs(station_dim[2]["longname"], "", None, "integer"),
    )
    station_names_dim_variable = xr.Variable(
        dims=[str_dim[0], STATION_DIMNAME],
        # Zero-initialised (empty byte strings) rather than np.empty, so that no
        # uninitialised memory is exposed before the names are assigned.
        data=np.zeros((len(str_dim[1]), len(station_values)), dtype=np.bytes_),
        encoding={FILLVALUE_ATTR_KEY: None},
        attrs=_attrs("station or node name", "", None, "char"),
    )
    ensemble_member_id_variable = xr.Variable(
        dims=[ENS_MEMBER_DIMNAME],
        data=ensemble_dim[1],
        encoding={FILLVALUE_ATTR_KEY: None},
        attrs=_attrs(ensemble_dim[2]["longname"], "", None, "integer"),
    )
    lead_time_dim_variable = xr.Variable(
        dims=[LEAD_TIME_DIMNAME],
        data=lead_time_dim[1],
        encoding={FILLVALUE_ATTR_KEY: None},
        attrs=_attrs(lead_time_dim[2]["longname"], lead_time_tstep + " since time", None, "integer"),
    )
    # Coordinates are unknown at this stage: fill them with the declared missing
    # value instead of leaving arbitrary memory content in the arrays.
    latitude_dim_variable = xr.Variable(
        dims=[STATION_DIMNAME],
        data=np.full_like(station_values, missing_coordinate, dtype=float),
        encoding={FILLVALUE_ATTR_KEY: missing_coordinate},
        attrs=_attrs("latitude", "degrees north", missing_coordinate, "float"),
    )
    longitude_dim_variable = xr.Variable(
        dims=[STATION_DIMNAME],
        data=np.full_like(station_values, missing_coordinate, dtype=float),
        encoding={FILLVALUE_ATTR_KEY: missing_coordinate},
        attrs=_attrs("longitude", "degrees east", missing_coordinate, "float"),
    )

    return {
        "station_ids_var": station_id_variable,
        "station_names_var": station_names_dim_variable,
        "ensemble_var": ensemble_member_id_variable,
        "lead_time_var": lead_time_dim_variable,
        "latitude_var": latitude_dim_variable,
        "longitude_var": longitude_dim_variable,
    }

298 

299 

def create_optional_vardefs(
    station_dim: Tuple[str, np.ndarray, Dict[str, str]],
    vars_def: Optional[pd.DataFrame] = None,
) -> pd.Series:
    """Create optional variable definitions.

    Args:
        station_dim: station dimension tuple; its first element is used as the
            name of the single dimension of every optional variable.
        vars_def: data frame with one row per optional variable and the columns
            ``name``, the units attribute key, ``missval``, ``longname`` and
            ``precision``. When ``None``, the template returned by
            ``default_optional_variable_definitions_v2_0()`` is used.

    Returns:
        The result of applying the row conversion to ``vars_def`` row by row:
        one dictionary per row with the keys ``name``, the units attribute key,
        ``dim``, ``missval``, ``longname`` and ``prec``.
    """
    if vars_def is None:
        vars_def = default_optional_variable_definitions_v2_0()

    # https://github.com/jmp75/efts/blob/107c553045a37e6ef36b2eababf6a299e7883d50/docs/netcdf_for_water_forecasting.md#optional-variables
    station_dim_name = station_dim[0]

    def f(vd: Dict):  # noqa: ANN202
        return {
            "name": vd["name"],
            UNITS_ATTR_KEY: vd[UNITS_ATTR_KEY],
            # A one-element list holding the dimension name; list(name) would
            # split the name into its individual characters.
            "dim": [station_dim_name],
            "missval": vd["missval"],
            "longname": vd["longname"],
            "prec": vd["precision"],
        }

    return vars_def.apply(f, axis=1)

321 

322 

323#' Create netCDF variables according to the definition 

324#' 

325#' Create netCDF variables according to the definition 

326#' 

327#' @param data_var_def a list, with each item itself a list suitable as a variable definition argument to create_data_variable 

328#' @param time_dim_info a list with the units and values defining the time dimension of the data set 

329#' @param num_stations number of (gauging) stations identifying points in the data set 

330#' @param lead_length length of the lead forecasting time series. 

331#' @param ensemble_length number of ensembles, i.e. number of forecasts for each point on the main time axis of the data set 

332#' @param optional_vars a data frame defining optional netCDF variables. For a templated default see 

333#' \code{\link{default_optional_variable_definitions_v2_0}} and 

334#' \url{https://github.com/jmp75/efts/blob/107c553045a37e6ef36b2eababf6a299e7883d50/docs/netcdf_for_water_forecasting.md#optional-variables} 

335#' @param lead_time_tstep string specifying the time step of the forecast lead length. 

336#' @seealso See 

337#' \code{\link{create_efts}} for examples 

def create_efts_variables(
    data_var_def: Dict,
    time_dim_info: Dict,
    num_stations: int,
    lead_length: int,
    ensemble_length: int,
    optional_vars: Optional[pd.DataFrame],
    lead_time_tstep: str,
) -> Dict[str, Any]:
    """Create netCDF variables according to the definition.

    Args:
        data_var_def: dictionary of variable definitions; each value must have
            a ``dim_type`` entry equal to ``"2"``, ``"3"`` or ``"4"``.
        time_dim_info: units and values defining the time dimension.
        num_stations: number of stations identifying points in the data set.
        lead_length: length of the lead forecasting time series.
        ensemble_length: number of ensemble members.
        optional_vars: data frame defining optional variables, or ``None`` to
            create none.
        lead_time_tstep: time step of the forecast lead time.

    Returns:
        A dictionary with two entries: ``metadatavars`` (mandatory variables,
        plus the optional variable definitions keyed by variable name) and
        ``datavars`` (the result of ``empty_data_variables``).

    Raises:
        ValueError: if a data variable definition has an unsupported
            ``dim_type``.
    """
    efts_dims = _create_nc_dims(
        time_dim_info=time_dim_info,
        num_stations=num_stations,
        lead_length=lead_length,
        ensemble_length=ensemble_length,
    )

    time_dim = efts_dims["time_dim"]
    lead_time_dim = efts_dims["lead_time_dim"]
    station_dim = efts_dims["station_dim"]
    str_dim = efts_dims["str_dim"]
    ensemble_dim = efts_dims["ensemble_dim"]

    variables_metadata = create_mandatory_vardefs(
        station_dim,
        str_dim,
        ensemble_dim,
        lead_time_dim,
        lead_time_tstep,
    )
    if optional_vars is not None and not optional_vars.empty:
        optional_var_ncdefs = create_optional_vardefs(
            station_dim,
            vars_def=optional_vars,
        )
        # TODO if not native to ncdf4: check name clashes
        # The definitions come back as a pandas Series; passing it directly to
        # dict.update would key the entries by row label, so key them by
        # variable name explicitly.
        variables_metadata.update({vd["name"]: vd for vd in optional_var_ncdefs})

    unknown_dims = [x for x in data_var_def.values() if x["dim_type"] not in ["2", "3", "4"]]
    if unknown_dims:
        raise ValueError(
            f"Invalid dimension specifications for {len(unknown_dims)} variables. Only supported are characters 2, 3, 4",
        )

    return {
        "metadatavars": variables_metadata,
        "datavars": empty_data_variables(data_var_def, time_dim, lead_time_dim, station_dim, ensemble_dim),
    }

391 

392 

def empty_data_variables(
    data_var_def: dict,
    time_dim_tmp: Tuple[str, np.ndarray, Dict[str, str]],  # noqa: ARG001
    lead_time_dim_tmp: Tuple[str, np.ndarray, Dict[str, str]],  # noqa: ARG001
    station_dim_tmp: Tuple[str, np.ndarray, Dict[str, str]],  # noqa: ARG001
    ensemble_dim_tmp: Tuple[str, np.ndarray, Dict[str, str]],  # noqa: ARG001
) -> dict:
    """Create data variables as defined in the definition.

    Not implemented: every call raises ``NotImplementedError``. The statements
    after the ``raise`` are an unreachable draft of the intended behaviour.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Not implemented yet")

    # --- unreachable draft below ---
    time_dim = "not implemented"
    lead_time_dim = "not implemented"
    station_dim = "not implemented"
    ensemble_dim = "not implemented"

    # Dimension list per dimension type code, visited in the order 4, 3, 2.
    dims_for_type = {
        "4": [lead_time_dim, station_dim, ensemble_dim, time_dim],
        "3": [station_dim, ensemble_dim, time_dim],
        "2": [station_dim, time_dim],
    }

    data_variables = {}
    for dim_type, dims in dims_for_type.items():
        for var_def in data_var_def.values():
            if var_def["dim_type"] == dim_type:
                data_variables[var_def["name"]] = create_data_variable(var_def, dims)

    return data_variables