Coverage for src/efts_io/variables.py: 19.74%
62 statements
« prev ^ index » next coverage.py v7.6.1, created at 2025-07-24 10:14 +1000
« prev ^ index » next coverage.py v7.6.1, created at 2025-07-24 10:14 +1000
1"""Handling of EFTS netCDF variables definitions."""
3from typing import Any, Dict, Optional, Tuple
5# import netCDF4
6import numpy as np
7import pandas as pd
8import xarray as xr
10from efts_io._internals import create_data_variable
11from efts_io.attributes import create_var_attribute_definition
12from efts_io.conventions import (
13 AREA_VARNAME,
14 ENS_MEMBER_DIMNAME,
15 FILLVALUE_ATTR_KEY,
16 LEAD_TIME_DIMNAME,
17 STANDARD_NAME_ATTR_KEY,
18 STATION_DIMNAME,
19 UNITS_ATTR_KEY,
20)
21from efts_io.dimensions import _create_nc_dims
24#' Create a variable definition
25#'
26#' Create a variable definition usable by the function \code{\link{create_efts_variables}} to create netCDF variables.
27#'
28#' @param name variable name
29#' @param longname variable long name
30#' @param units variable units
31#' @param missval value code for missing data
32#' @param precision precision
33#' @param dim_type dimension type (EFTS integer code)
34#' @param var_attribute list of attributes for the netCDF variable to create
35#' @export
36#' @return a list
37#' @examples
38#' var_def = create_variable_definition(name='rain_der',
39#' longname='Rainfall ensemble forecast derived from some prediction', units='mm',
40#' missval=-9999.0, precision='double', var_attribute=list(type=2L,
41#' description="accumulated over the preceding interval",
42#' dat_type = "der", dat_type_description="AWAP data interpolated from observations",
43#' location_type = "Point"))
def create_variable_definition(
    name: str,
    longname: str = "",
    units: str = "mm",
    missval: float = -9999.0,
    precision: str = "double",
    dim_type: str = "4",
    var_attribute: Optional[dict[str, str]] = None,
) -> dict[str, Any]:
    """Assemble the definition of a single EFTS data variable.

    Args:
        name: variable name.
        longname: variable long name.
        units: variable units, stored under the ``UNITS_ATTR_KEY`` key.
        missval: value code for missing data.
        precision: precision of the variable, e.g. ``"double"``.
        dim_type: dimension type code of the variable.
        var_attribute: attributes for the netCDF variable to create. When
            ``None``, the result of ``create_var_attribute_definition()``
            is used instead.

    Returns:
        A dictionary with the keys ``"name"``, ``"longname"``,
        ``UNITS_ATTR_KEY``, ``"dim_type"``, ``"missval"``, ``"precision"``
        and ``"attributes"``.
    """
    attributes = create_var_attribute_definition() if var_attribute is None else var_attribute

    definition: dict[str, Any] = {"name": name, "longname": longname}
    # The units key is a named constant, so it is assigned separately.
    definition[UNITS_ATTR_KEY] = units
    definition.update(
        dim_type=dim_type,
        missval=missval,
        precision=precision,
        attributes=attributes,
    )
    return definition
67# #' Create a variables definition data frame
68# #'
69# #' Create a variable definition usable by the function \code{\link{create_variable_definitions}}
70# #' to create netCDF variables. The use of this function is not compulsory to create an EFTS
71# #' netCDF schema, just offered as a convenience.
72# #'
73# #' @param variable_names character vector, names of the variables
74# #' @param long_names character vector, long names of the variables (defaults to variable_names if missing)
75# #' @param standard_names character vector, standard names of the variables (optional, defaults to variable_names)
76# #' @param units character vector, units for the variable(s)
77# #' @param missval numeric vector, missing value code(s) for the variable(s)
78# #' @param precision character vector, precision of the variables
79# #' @param dimensions character or integer vector, number of dimensions each variable (2, 3 or 4)
80# #' @param var_attributes a list of named attributes. See \code{\link{create_var_attribute_definition}}
81# #' @export
82# #' @return a data frame suitable for \code{\link{create_variable_definition}}
83# #' @seealso See
84# #' \code{\link{create_variable_definition}} and \code{\link{create_efts}} for examples
85# create_variable_definition_dataframe(variable_names, long_names = variable_names, standard_names = variable_names, units = "mm", missval = -9999.0,
86# precision = "double", dimensions = 4L, var_attributes = create_var_attribute_definition()) {
87# stopifnot(is.character(variable_names))
88# varsDef = data.frame(name = variable_names, stringsAsFactors = FALSE)
89# varsDef$longname = long_names
90# varsDef$standard_name = standard_names
91# varsDef$units = units
92# varsDef$missval = missval
93# varsDef$precision = precision
94# varsDef$dimensions = as.integer(dimensions)
96# va = data.frame(var_attributes, stringsAsFactors = FALSE)
97# if(nrow(va) < nrow(varsDef)) {
98# va = va[ rep(1:nrow(va), length.out=nrow(varsDef)), ]
99# }
101# varsDef = cbind(varsDef, va)
102# rownames(varsDef) = varsDef$name
103# return(varsDef)
104# }
107#' Provide a template definition of optional geolocation variables
108#'
109#' Provide a template definition of optional geolocation and geographic variables x, y, area and elevation.
110#' See \url{https://github.com/jmp75/efts/blob/107c553045a37e6ef36b2eababf6a299e7883d50/docs/netcdf_for_water_forecasting.md#optional-variables}.
111#'
112#' @export
113#' @return a data frame
114#' @seealso See
115#' \code{\link{create_variable_definition}} and \code{\link{create_efts}} for examples
116#' @export
def default_optional_variable_definitions_v2_0() -> pd.DataFrame:
    """Provide a template definition of optional geolocation variables.

    The template describes four per-station variables: ``x``, ``y``,
    the area variable (``AREA_VARNAME``) and ``elevation``.

    Returns:
        A data frame with one row per variable and the columns ``"name"``,
        ``"longname"``, ``STANDARD_NAME_ATTR_KEY``, ``UNITS_ATTR_KEY``,
        ``"missval"`` and ``"precision"``.
    """
    return pd.DataFrame.from_dict(
        {
            "name": ["x", "y", AREA_VARNAME, "elevation"],
            "longname": [
                "easting from the GDA94 datum in MGA Zone 55",
                "northing from the GDA94 datum in MGA Zone 55",
                "catchment area",
                "station elevation above sea level",
            ],
            # Same order as "name": x is the easting and y the northing,
            # in agreement with the long names above.
            STANDARD_NAME_ATTR_KEY: [
                "easting_GDA94_zone55",
                "northing_GDA94_zone55",
                AREA_VARNAME,
                "elevation",
            ],
            UNITS_ATTR_KEY: ["", "", "km^2", "m"],
            "missval": [np.nan, np.nan, -9999.0, -9999.0],
            "precision": np.repeat("float", 4),
        },
    )
141# ########################################
142# # Below are functions not exported
143# ########################################
146#' Create variable definitions from a data frame
147#'
148#' Given a data frame as input, create a list of variable definitions usable by the function \code{\link{create_efts_variables}} to create netCDF variables.
149#'
150#' @param dframe a data frame, one row per variable definition. Must have at least the following column names: 'name', 'longname', 'units', 'missval', 'precision', 'dimensions'. Any other column (e.g. 'type', 'type_description', 'location_type') is passed through as a variable attribute.
151#' @export
152#' @return a list of length equal to the number of rows in the input data frame
153#' @seealso See
154#' \code{\link{create_efts}} for examples
155#' @examples
156#' varsDef = data.frame(name=letters[1:3], stringsAsFactors=FALSE)
157#' varsDef$longname=paste('long name for', varsDef$name)
158#' varsDef$units='mm'
159#' varsDef$missval=-999.0
160#' varsDef$precision='double'
161#' varsDef$type=2
162#' varsDef$type_description='accumulated over the previous time step'
163#' varsDef$location_type='Point'
164#' str(create_variable_definitions(varsDef))
165#'
def create_variable_definitions(dframe: pd.DataFrame) -> Dict[str, Any]:
    """Create variable definitions from a data frame.

    Each row of ``dframe`` is turned into one variable definition via
    ``create_variable_definition``.

    Args:
        dframe: one row per variable. The columns ``"name"``,
            ``"longname"``, ``UNITS_ATTR_KEY``, ``"missval"``,
            ``"precision"`` and ``"dimensions"`` are read explicitly; every
            other column is collected into the variable's attributes.

    Returns:
        A dictionary mapping each variable name to its definition, in row
        order. An empty data frame yields an empty dictionary. If a name
        appears in several rows, the last row wins.

    Raises:
        KeyError: if one of the explicitly read columns is missing.
    """
    required_columns = ["name", "longname", UNITS_ATTR_KEY, "missval", "precision", "dimensions"]
    attribute_columns = [col for col in dframe.columns if col not in required_columns]

    # Rows are iterated explicitly rather than with ``DataFrame.apply``:
    # on an empty frame ``apply(axis=1)`` hands back a DataFrame, whose
    # ``to_dict()`` is keyed by column and made this function fail.
    definitions: Dict[str, Any] = {}
    for _, row in dframe.iterrows():
        # NOTE(review): "dimensions" is forwarded as-is; create_efts_variables
        # only accepts the strings "2", "3", "4" - confirm callers provide
        # strings rather than integers.
        definitions[row["name"]] = create_variable_definition(
            name=row["name"],
            longname=row["longname"],
            units=row[UNITS_ATTR_KEY],
            missval=row["missval"],
            precision=row["precision"],
            dim_type=row["dimensions"],
            var_attribute={col: row[col] for col in attribute_columns},
        )
    return definitions
def create_mandatory_vardefs(
    station_dim: Tuple[str, np.ndarray, Dict[str, str]],
    str_dim: Tuple[str, np.ndarray, Dict[str, str]],
    ensemble_dim: Tuple[str, np.ndarray, Dict[str, str]],
    lead_time_dim: Tuple[str, np.ndarray, Dict[str, str]],
    lead_time_tstep: str = "hours",
) -> Dict[str, xr.Variable]:
    """Create mandatory variable definitions.

    Each ``*_dim`` argument is indexed as ``(name, values, attributes)``:
    element ``[1]`` supplies the coordinate values and element ``[2]`` a
    dictionary from which the ``"longname"`` entry is read.

    Args:
        station_dim: station dimension; its values become the station
            identifiers and give the length of latitude and longitude.
        str_dim: string-length dimension; its name is the first dimension
            of the station names variable.
        ensemble_dim: ensemble member dimension.
        lead_time_dim: lead time dimension.
        lead_time_tstep: time step name used to build the lead time units,
            ``"<lead_time_tstep> since time"``.

    Returns:
        A dictionary of ``xarray.Variable`` objects with the keys
        ``"station_ids_var"``, ``"station_names_var"``, ``"ensemble_var"``,
        ``"lead_time_var"``, ``"latitude_var"`` and ``"longitude_var"``.
    """
    # https://github.com/jmp75/efts/blob/107c553045a37e6ef36b2eababf6a299e7883d50/docs/netcdf_for_water_forecasting.md#mandatory-variables
    # float time(time)
    # int station_id(station)
    # char station_name(strLen, station)
    # int ens_member(ens_member)
    # float lead_time(lead_time)
    # float lat (station)
    # float lon (station)

    def _attrs(longname: str, units: str, missval: Optional[float], precision: str) -> Dict[str, Any]:
        """Build a fresh attribute dictionary for one variable."""
        return {
            "longname": longname,
            UNITS_ATTR_KEY: units,
            "missval": missval,
            "precision": precision,
        }

    station_values = station_dim[1]

    station_id_variable = xr.Variable(
        dims=[STATION_DIMNAME],
        data=station_values,
        encoding={FILLVALUE_ATTR_KEY: None},
        attrs=_attrs(station_dim[2]["longname"], "", None, "integer"),
    )
    station_names_dim_variable = xr.Variable(
        dims=[str_dim[0], STATION_DIMNAME],
        # NOTE(review): placeholder bytes array of shape (strLen, station),
        # kept exactly as originally written. The original author was not
        # sure this is the desired representation - confirm.
        data=np.empty_like(
            prototype=b"",
            shape=(len(str_dim[1]), len(station_values)),
            dtype=np.bytes_,
        ),
        encoding={FILLVALUE_ATTR_KEY: None},
        attrs=_attrs("station or node name", "", None, "char"),
    )
    ensemble_member_id_variable = xr.Variable(
        dims=[ENS_MEMBER_DIMNAME],
        data=ensemble_dim[1],
        encoding={FILLVALUE_ATTR_KEY: None},
        attrs=_attrs(ensemble_dim[2]["longname"], "", None, "integer"),
    )
    lead_time_dim_variable = xr.Variable(
        dims=[LEAD_TIME_DIMNAME],
        data=lead_time_dim[1],
        encoding={FILLVALUE_ATTR_KEY: None},
        attrs=_attrs(lead_time_dim[2]["longname"], f"{lead_time_tstep} since time", None, "integer"),
    )
    # Latitude and longitude are not known yet. They are filled with NaN
    # rather than left as uninitialised memory (np.empty_like), so the
    # placeholders are deterministic and read as missing values.
    latitude_dim_variable = xr.Variable(
        dims=[STATION_DIMNAME],
        data=np.full_like(station_values, np.nan, dtype=float),
        encoding={FILLVALUE_ATTR_KEY: -9999.0},
        attrs=_attrs("latitude", "degrees north", -9999.0, "float"),
    )
    longitude_dim_variable = xr.Variable(
        dims=[STATION_DIMNAME],
        data=np.full_like(station_values, np.nan, dtype=float),
        encoding={FILLVALUE_ATTR_KEY: -9999.0},
        attrs=_attrs("longitude", "degrees east", -9999.0, "float"),
    )

    return {
        "station_ids_var": station_id_variable,
        "station_names_var": station_names_dim_variable,
        "ensemble_var": ensemble_member_id_variable,
        "lead_time_var": lead_time_dim_variable,
        "latitude_var": latitude_dim_variable,
        "longitude_var": longitude_dim_variable,
    }
def create_optional_vardefs(
    station_dim: Tuple[str, np.ndarray, Dict[str, str]],
    vars_def: Optional[pd.DataFrame] = None,
) -> pd.Series:
    """Create optional variable definitions.

    Args:
        station_dim: station dimension; element ``[0]`` is used as the name
            of the single dimension of every optional variable.
        vars_def: one row per optional variable, with the columns
            ``"name"``, ``UNITS_ATTR_KEY``, ``"missval"``, ``"longname"``
            and ``"precision"``. Defaults to
            ``default_optional_variable_definitions_v2_0()``.

    Returns:
        A series indexed like ``vars_def`` whose values are dictionaries
        with the keys ``"name"``, ``UNITS_ATTR_KEY``, ``"dim"``,
        ``"missval"``, ``"longname"`` and ``"prec"``. The series is empty
        when ``vars_def`` has no rows.
    """
    if vars_def is None:
        vars_def = default_optional_variable_definitions_v2_0()

    # https://github.com/jmp75/efts/blob/107c553045a37e6ef36b2eababf6a299e7883d50/docs/netcdf_for_water_forecasting.md#optional-variables
    station_dim_name = station_dim[0]
    definitions = [
        {
            "name": row["name"],
            UNITS_ATTR_KEY: row[UNITS_ATTR_KEY],
            # A one-element list holding the dimension name. list(name)
            # would split the string into its individual characters.
            "dim": [station_dim_name],
            "missval": row["missval"],
            "longname": row["longname"],
            "prec": row["precision"],
        }
        for _, row in vars_def.iterrows()
    ]
    # Built explicitly so that an empty frame still yields a Series.
    return pd.Series(definitions, index=vars_def.index, dtype=object)
323#' Create netCDF variables according to the definition
324#'
325#' Create netCDF variables according to the definition
326#'
327#' @param data_var_def a list, with each item itself a list suitable as a variable definition argument to create_data_variable
328#' @param time_dim_info a list with the units and values defining the time dimension of the data set
329#' @param num_stations number of (gauging) stations identifying points in the data set
330#' @param lead_length length of the lead forecasting time series.
331#' @param ensemble_length number of ensembles, i.e. number of forecasts for each point on the main time axis of the data set
332#' @param optional_vars a data frame defining optional netCDF variables. For a templated default see
333#' \code{\link{default_optional_variable_definitions_v2_0}} and
334#' \url{https://github.com/jmp75/efts/blob/107c553045a37e6ef36b2eababf6a299e7883d50/docs/netcdf_for_water_forecasting.md#optional-variables}
335#' @param lead_time_tstep string specifying the time step of the forecast lead length.
336#' @seealso See
337#' \code{\link{create_efts}} for examples
def create_efts_variables(
    data_var_def: Dict,
    time_dim_info: Dict,
    num_stations: int,
    lead_length: int,
    ensemble_length: int,
    optional_vars: Optional[pd.DataFrame],
    lead_time_tstep: str,
) -> Dict[str, Any]:
    """Create netCDF variables according to the definition.

    Args:
        data_var_def: mapping of variable definitions; each value must have
            a ``"dim_type"`` entry equal to ``"2"``, ``"3"`` or ``"4"``.
        time_dim_info: information defining the time dimension, passed to
            ``_create_nc_dims``.
        num_stations: number of stations, passed to ``_create_nc_dims``.
        lead_length: length of the lead time dimension, passed to
            ``_create_nc_dims``.
        ensemble_length: number of ensemble members, passed to
            ``_create_nc_dims``.
        optional_vars: data frame defining optional variables, or ``None``
            to create none. An empty data frame also creates none.
        lead_time_tstep: time step name of the forecast lead time.

    Returns:
        A dictionary with two entries: ``"metadatavars"`` (mandatory and
        optional variables, keyed by name) and ``"datavars"`` (the data
        variables).

    Raises:
        ValueError: if any definition in ``data_var_def`` has a
            ``"dim_type"`` other than ``"2"``, ``"3"`` or ``"4"``.
    """
    efts_dims = _create_nc_dims(
        time_dim_info=time_dim_info,
        num_stations=num_stations,
        lead_length=lead_length,
        ensemble_length=ensemble_length,
    )

    time_dim = efts_dims["time_dim"]
    lead_time_dim = efts_dims["lead_time_dim"]
    station_dim = efts_dims["station_dim"]
    str_dim = efts_dims["str_dim"]
    ensemble_dim = efts_dims["ensemble_dim"]

    mandatory_var_ncdefs = create_mandatory_vardefs(
        station_dim,
        str_dim,
        ensemble_dim,
        lead_time_dim,
        lead_time_tstep,
    )
    # Copy, so that adding optional variables does not alter the mapping
    # returned by create_mandatory_vardefs.
    variables_metadata: Dict[str, Any] = dict(mandatory_var_ncdefs)
    if optional_vars is not None and not optional_vars.empty:
        optional_var_ncdefs = create_optional_vardefs(
            station_dim,
            vars_def=optional_vars,
        )
        # TODO if not native to ncdf4: check name clashes
        # The optional definitions come back as a pandas Series. Passing it
        # straight to dict.update() would insert its row labels as keys, so
        # the entries are keyed by variable name explicitly.
        variables_metadata.update({vd["name"]: vd for vd in optional_var_ncdefs})

    unknown_dims = [x for x in data_var_def.values() if x["dim_type"] not in ["2", "3", "4"]]
    if unknown_dims:
        raise ValueError(
            f"Invalid dimension specifications for {len(unknown_dims)} variables. Only supported are characters 2, 3, 4",
        )

    data_variables = empty_data_variables(data_var_def, time_dim, lead_time_dim, station_dim, ensemble_dim)
    return {
        "metadatavars": variables_metadata,
        "datavars": data_variables,
    }
def empty_data_variables(
    data_var_def: dict,
    time_dim_tmp: Tuple[str, np.ndarray, Dict[str, str]],  # noqa: ARG001
    lead_time_dim_tmp: Tuple[str, np.ndarray, Dict[str, str]],  # noqa: ARG001
    station_dim_tmp: Tuple[str, np.ndarray, Dict[str, str]],  # noqa: ARG001
    ensemble_dim_tmp: Tuple[str, np.ndarray, Dict[str, str]],  # noqa: ARG001
) -> dict:
    """Create data variables as defined in the definition.

    Not implemented yet: the function always raises. The sketch that used
    to follow the ``raise`` statement (and could never run) planned to call
    ``create_data_variable`` for each definition, choosing the dimensions
    from its ``"dim_type"``:

    * ``"4"``: lead time, station, ensemble, time
    * ``"3"``: station, ensemble, time
    * ``"2"``: station, time

    Args:
        data_var_def: mapping of variable definitions (currently unused).
        time_dim_tmp: time dimension (currently unused).
        lead_time_dim_tmp: lead time dimension (currently unused).
        station_dim_tmp: station dimension (currently unused).
        ensemble_dim_tmp: ensemble dimension (currently unused).

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("Not implemented yet")