Skip to content

preprocess_toolbox.dataset.time


process_missing_dates(ds, ds_config, variable, end_date=None, start_date=None)

TODO: we need to be able to add more missing dates when they are detected spatially (i.e. a time step that is full of NaNs). TODO: we should limit interpolation, or treat the steps as missing as above, when a gap spans more than n steps.

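Args: ds: dataset holding `variable` along a `time` coordinate. ds_config: dataset configuration; `frequency.freq` gives the expected time step and the optional `invalid_dates` lists dates to discard and re-interpolate. variable: name of the data variable to process. end_date: optional last date of the expected range (defaults to the last observed date). start_date: optional first date of the expected range (defaults to the first observed date).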

Returns:

Source code in preprocess_toolbox/dataset/time.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def process_missing_dates(ds: xr.Dataset,
                          ds_config: DatasetConfig,
                          variable: str,
                          end_date: dt.date = None,
                          start_date: dt.date = None) -> xr.Dataset:
    """Fill gaps in a variable's time series by interpolating along ``time``.

    The expected dates are generated with ``pd.date_range`` from the start to
    the end of the range at a step of ``ds_config.frequency.freq``. Any expected
    date that has no observation, or that is listed in
    ``ds_config.invalid_dates``, is (re)created with ``DataArray.interp``.
    Observations at invalid dates are dropped before interpolating so that
    they do not contribute to the interpolated values.

    TODO: we need to be able to add more missing dates as detected spatially (full of nans)
    TODO: we should be limiting interpolation or doing the above when over n steps

    Args:
        ds: Dataset containing ``variable`` with a ``time`` coordinate.
        ds_config: Configuration object; ``frequency.freq`` is read for the
            time step and ``invalid_dates`` (optional) for dates to discard.
        variable: Name of the data variable to process.
        end_date: Last date of the expected range; defaults to the latest
            observed date.
        start_date: First date of the expected range; defaults to the
            earliest observed date.

    Returns:
        A dataset whose ``time`` axis is that of the processed variable and
        whose ``variable`` holds the sorted, gap-filled data. This is a new
        object (the result of ``reindex``); use the return value rather than
        relying on ``ds`` being modified in place.
    """
    # Index by name rather than getattr(): normal attribute lookup finds
    # Dataset methods/properties first, so a variable called e.g. "mean"
    # would otherwise resolve to the method instead of the data.
    da = ds[variable].sortby('time')

    dates_obs = [pd.to_datetime(date).date() for date in da.time.values]
    range_start = min(dates_obs) if start_date is None else start_date
    range_end = max(dates_obs) if end_date is None else end_date
    dates_all = [pd.to_datetime(date).date() for date in
                 pd.date_range(range_start,
                               range_end,
                               freq="1{}".format(ds_config.frequency.freq))]

    # A config without the attribute, or with it set to None, means "no
    # invalid dates".
    invalid_dates = getattr(ds_config, "invalid_dates", None) or []

    # Only drop invalid timestamps that are actually present, otherwise
    # drop_sel would raise on the unknown labels.
    observed_times = pd.DatetimeIndex(da.time.values)
    drop_dates = [stamp for stamp in map(pd.Timestamp, invalid_dates)
                  if stamp in observed_times]
    da = da.drop_sel(time=drop_dates)

    # Normalise the invalid entries to plain dates so the comparison below
    # works whatever form (date, datetime, Timestamp, string) the config uses;
    # sets keep the membership tests O(1).
    observed_set = set(dates_obs)
    invalid_set = {pd.Timestamp(el).date() for el in invalid_dates}
    missing_dates = [date for date in dates_all
                     if date not in observed_set or date in invalid_set]

    logging.info("Interpolating {} missing dates".format(len(missing_dates)))

    # Collect every timestamp that still needs a value, skipping ones already
    # on the axis and duplicates (several range steps can share one date).
    present = set(pd.DatetimeIndex(da.time.values))
    fill_times = []
    for date in missing_dates:
        stamp = pd.Timestamp(date)
        if stamp not in present:
            logging.info("Interpolating {}".format(date))
            present.add(stamp)
            fill_times.append(stamp)

    if fill_times:
        # One interp + one concat instead of one pair per date: avoids
        # re-copying the growing array for every gap. With the default
        # (linear) method this should give the same values as filling the
        # gaps one at a time - NOTE(review): confirm if a non-linear method
        # is ever introduced here.
        filled = da.interp(time=pd.DatetimeIndex(fill_times).values)
        da = xr.concat([da, filled], dim='time')

    logging.debug("Finished interpolation")

    da = da.sortby('time')

    # Dataset item assignment aligns the new array to the dataset's existing
    # time index, which would silently discard the interpolated steps, so the
    # dataset is first moved onto the extended time axis.
    ds = ds.reindex(time=da.time.values)
    ds[variable] = da
    return ds