Skip to content

Utils Module🔗

calculate_temporal_stats(df) 🔗

Calculate temporal statistics for a given DataFrame.

Source code in src/water_timeseries/utils/data.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def calculate_temporal_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate temporal statistics for a given DataFrame.

    Args:
        df: DataFrame with "pre_break_median", "post_break_median" and
            "date_break" columns. The input frame is not modified.

    Returns:
        A copy of ``df`` where zero medians are masked to NaN, with added
        columns "date_break_year", "date_break_month", "water_change_ha"
        (post minus pre) and "water_change_perc" (change relative to the
        pre-break median, in percent).
    """
    df = df.copy()
    # Zero medians are treated as missing data and masked to NaN.
    df["pre_break_median"] = df["pre_break_median"].where(df["pre_break_median"] != 0, np.nan)
    df["post_break_median"] = df["post_break_median"].where(df["post_break_median"] != 0, np.nan)
    breaks = pd.to_datetime(df["date_break"])
    df["date_break_year"] = breaks.dt.year
    df["date_break_month"] = breaks.dt.month
    # Change in water area (ha).
    df["water_change_ha"] = df["post_break_median"] - df["pre_break_median"]
    # Relative change (%). Pre-break zeros are already NaN (masked above),
    # so the division cannot hit a literal zero.
    df["water_change_perc"] = df["water_change_ha"].div(df["pre_break_median"]) * 100
    return df

create_tile_layers() 🔗

Create tile layers for folium maps.

Returns:

Type Description

List of tile layer names that can be added to folium.Map

Source code in src/water_timeseries/utils/map_styling.py
149
150
151
152
153
154
155
def create_tile_layers():
    """Create tile layers for folium maps.

    Returns:
        List of tile layer names that can be added to folium.Map
    """
    layer_names = [
        "CartoDB.DarkMatter",
        "Esri.WorldImagery",
    ]
    return layer_names

format_tooltip_columns(valid_gdf, id_column, tooltip_columns=None) 🔗

Format columns for tooltip display to avoid JSON serialization issues.

Parameters:

Name Type Description Default
valid_gdf

GeoDataFrame to format

required
id_column str

Name of the ID column (always shown first)

required
tooltip_columns

List of tuples (original_col, display_alias, format_string, unit) If None, uses default NetChange columns

None

Returns:

Name Type Description
formatted_gdf

GeoDataFrame with display columns added

fields_to_show

List of field names for tooltip

aliases_to_show

List of field aliases for tooltip

Source code in src/water_timeseries/utils/map_styling.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def format_tooltip_columns(
    valid_gdf,
    id_column: str,
    tooltip_columns=None,
):
    """Format columns for tooltip display to avoid JSON serialization issues.

    Args:
        valid_gdf: GeoDataFrame to format.
        id_column: Name of the ID column (always shown first).
        tooltip_columns: List of tuples (original_col, display_alias,
            format_string, unit). If None, uses default NetChange columns.

    Returns:
        formatted_gdf: GeoDataFrame with display columns added.
        fields_to_show: List of field names for the tooltip.
        aliases_to_show: List of field aliases for the tooltip.
    """
    import pandas as pd

    if tooltip_columns is None:
        # Default tooltip columns if NetChange data exists.
        tooltip_columns = [
            ("NetChange_perc", "Net Change (%):", "{:.2f}", "%"),
            ("NetChange_ha", "Net Change (ha):", "{:.2f}", " ha"),
        ]

    # Keep only the specs whose source column actually exists in the frame.
    present = [spec for spec in tooltip_columns if spec[0] in valid_gdf.columns]

    if not present:
        # None of the requested columns exist: fall back to the ID alone.
        return valid_gdf, [id_column], ["ID:"]

    valid_gdf = valid_gdf.copy()
    fields_to_show = [id_column]
    aliases_to_show = ["ID:"]

    for source_col, alias, fmt, unit in present:
        target_col = f"{source_col}_display"

        def render(value, _fmt=fmt, _unit=unit):
            # Pre-format to a plain string so nothing non-serializable
            # (e.g. NaN) reaches the tooltip JSON.
            return f"{_fmt.format(value)}{_unit}" if pd.notna(value) else "N/A"

        valid_gdf[target_col] = valid_gdf[source_col].apply(render)
        fields_to_show.append(target_col)
        aliases_to_show.append(alias)

    return valid_gdf, fields_to_show, aliases_to_show

get_colored_style_function(color_column='NetChange_perc', vmin=-40, vmax=40, colormap=None, default_color='#cccccc', fill_opacity=0.6, edge_color='#dddddd', edge_weight=1) 🔗

Create a style function for folium polygons based on a numeric column.

Parameters:

Name Type Description Default
color_column str

Column name to use for coloring

'NetChange_perc'
vmin float

Minimum value for normalization

-40
vmax float

Maximum value for normalization

40
colormap

Matplotlib colormap (defaults to RdBu_r)

None
default_color str

Color for missing/null values

'#cccccc'
fill_opacity float

Opacity of polygon fill (0-1)

0.6
edge_color str

Color of polygon edges

'#dddddd'
edge_weight float

Width of polygon edges

1

Returns:

Name Type Description
style_function

Function that can be passed to folium.GeoJson style_function parameter

Source code in src/water_timeseries/utils/map_styling.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def get_colored_style_function(
    color_column: str = "NetChange_perc",
    vmin: float = -40,
    vmax: float = 40,
    colormap=None,
    default_color: str = "#cccccc",
    fill_opacity: float = 0.6,
    edge_color: str = "#dddddd",
    edge_weight: float = 1,
):
    """Create a style function for folium polygons based on a numeric column.

    Args:
        color_column: Column name to use for coloring.
        vmin: Minimum value for normalization.
        vmax: Maximum value for normalization.
        colormap: Matplotlib colormap (defaults to RdBu_r).
        default_color: Color for missing/null values.
        fill_opacity: Opacity of polygon fill (0-1).
        edge_color: Color of polygon edges.
        edge_weight: Width of polygon edges.

    Returns:
        style_function: Function that can be passed to folium.GeoJson
            style_function parameter.
    """
    import matplotlib.pyplot as plt
    import pandas as pd

    cmap = plt.cm.RdBu_r if colormap is None else colormap
    norm = plt.Normalize(vmin=vmin, vmax=vmax)

    def style_function(feature):
        value = feature.get("properties", {}).get(color_column)

        # Missing or NaN values get the neutral default styling.
        if value is None or pd.isna(value):
            return {
                "fillColor": default_color,
                "color": edge_color,
                "weight": edge_weight,
                "fillOpacity": 0.5,
            }

        # Map the normalized value through the colormap, then convert RGBA to
        # a hex string by hand to avoid JSON serialization issues.
        red, green, blue, _alpha = cmap(norm(value))
        fill = "#{:02x}{:02x}{:02x}".format(
            int(red * 255), int(green * 255), int(blue * 255)
        )

        return {
            "fillColor": fill,
            "color": edge_color,
            "weight": edge_weight,
            "fillOpacity": fill_opacity,
        }

    return style_function

get_default_style_function(fill_color='blue', edge_color='#dddddd', edge_weight=1, fill_opacity=0.5) 🔗

Create a default style function for folium polygons.

Parameters:

Name Type Description Default
fill_color str

Fill color for all polygons

'blue'
edge_color str

Color of polygon edges

'#dddddd'
edge_weight float

Width of polygon edges

1
fill_opacity float

Opacity of polygon fill (0-1)

0.5

Returns:

Name Type Description
style_function

Function that can be passed to folium.GeoJson style_function parameter

Source code in src/water_timeseries/utils/map_styling.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def get_default_style_function(
    fill_color: str = "blue",
    edge_color: str = "#dddddd",
    edge_weight: float = 1,
    fill_opacity: float = 0.5,
):
    """Create a default style function for folium polygons.

    Args:
        fill_color: Fill color for all polygons.
        edge_color: Color of polygon edges.
        edge_weight: Width of polygon edges.
        fill_opacity: Opacity of polygon fill (0-1).

    Returns:
        style_function: Function that can be passed to folium.GeoJson
            style_function parameter.
    """

    def style_function(_feature):
        # Constant styling for every feature; a fresh dict is built per call.
        return dict(
            fillColor=fill_color,
            color=edge_color,
            weight=edge_weight,
            fillOpacity=fill_opacity,
        )

    return style_function

get_water_dataset_type(input_ds) 🔗

Determine the water dataset type based on the presence of specific variables in the dataset.

Source code in src/water_timeseries/utils/data.py
21
22
23
24
25
26
27
28
29
30
def get_water_dataset_type(input_ds) -> str:
    """Determine the water dataset type based on the presence of specific variables in the dataset."""
    data_vars = input_ds.data_vars
    # JRC datasets expose "area_water_permanent"; Dynamic World exposes "water".
    if "area_water_permanent" in data_vars:
        return "jrc"
    if "water" in data_vars:
        return "dynamic_world"
    raise ValueError("Unknown water dataset type")

load_vector_dataset(file_path, logger=None) 🔗

Load a vector dataset from file based on file extension.

Supports GeoPackage, Shapefile, GeoJSON, and Parquet formats.

Parameters:

Name Type Description Default
file_path Union[str, Path]

Path to the vector dataset file.

required
logger Optional[logger]

Optional logger instance for logging messages.

None

Returns:

Type Description
Optional[GeoDataFrame]

GeoDataFrame if successful, None otherwise.

Raises:

Type Description
FileNotFoundError

If the file does not exist.

Source code in src/water_timeseries/utils/io.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def load_vector_dataset(
    file_path: Union[str, Path],
    logger: Optional[logger] = None,
) -> Optional[gpd.GeoDataFrame]:
    """Load a vector dataset from file based on file extension.

    Supports GeoPackage, Shapefile, GeoJSON, and Parquet formats.

    Args:
        file_path: Path to the vector dataset file.
        logger: Optional logger instance for logging messages.

    Returns:
        GeoDataFrame if successful, None otherwise.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    file_path = Path(file_path)

    # Fail fast on a missing file (warn first if a logger was supplied).
    if not file_path.exists():
        if logger:
            logger.warning(f"Vector dataset file not found: {file_path}")
        raise FileNotFoundError(f"Vector dataset file not found: {file_path}")

    suffix = file_path.suffix.lower()

    if logger:
        logger.info(f"Loading vector dataset from {file_path}")

    # Dispatch on the extension: Parquet has its own reader, the OGR-backed
    # formats all go through read_file.
    if suffix == ".parquet":
        return gpd.read_parquet(file_path)
    if suffix in (".gpkg", ".shp", ".geojson", ".gjson"):
        return gpd.read_file(file_path)

    if logger:
        logger.warning(f"Unsupported vector file format: {suffix}")
    return None

load_xarray_dataset(path, format=None) 🔗

Load xarray dataset from file.

Parameters:

Name Type Description Default
path Union[str, Path]

Path to the dataset file.

required
format Optional[str]

Format of the file ('zarr' or 'netcdf'). If None, auto-detected from extension.

None

Returns:

Type Description
Dataset

xr.Dataset: The loaded dataset.

Raises:

Type Description
ValueError

If the file format is not supported.

Source code in src/water_timeseries/utils/io.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def load_xarray_dataset(
    path: Union[str, Path],
    format: Optional[str] = None,
) -> xr.Dataset:
    """Load xarray dataset from file.

    Args:
        path: Path to the dataset file.
        format: Format of the file ('zarr' or 'netcdf'). If None, auto-detected
            from extension.

    Returns:
        xr.Dataset: The loaded dataset.

    Raises:
        ValueError: If the file format is not supported.
    """
    path = Path(path)

    if format is None:
        # Auto-detect the format from the file extension.
        ext = path.suffix.lower()
        format = {".zarr": "zarr", ".nc": "netcdf"}.get(ext)
        if format is None:
            raise ValueError(f"Cannot auto-detect format for extension: {ext}")

    if format == "zarr":
        return xr.open_zarr(path)
    if format == "netcdf":
        return xr.open_dataset(path)

    raise ValueError(f"Unsupported format: {format}. Use 'zarr' or 'netcdf'.")

save_xarray_dataset(ds, save_path, output_dir=None, logger=None) 🔗

Save xarray dataset to file.

Parameters:

Name Type Description Default
ds Dataset

The xarray dataset to save.

required
save_path Union[str, Path]

Path to save the file. Format is determined by extension: - '.zarr' for Zarr format - '.nc' for NetCDF format If a relative path is provided and output_dir is specified, the file will be saved in that directory.

required
output_dir Optional[Union[str, Path]]

Directory for relative paths. If None and save_path is relative, the current working directory is used.

None
logger

Logger for logging progress. If None, print statements are used.

None

Returns:

Name Type Description
Path Path

The resolved path where the dataset was saved.

Raises:

Type Description
ValueError

If the file extension is not supported.

Source code in src/water_timeseries/utils/io.py
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def save_xarray_dataset(
    ds: xr.Dataset,
    save_path: Union[str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    logger=None,
) -> Path:
    """Save xarray dataset to file.

    Args:
        ds: The xarray dataset to save.
        save_path: Path to save the file. Format is determined by extension:
            - '.zarr' for Zarr format
            - '.nc' for NetCDF format
            If a relative path is provided and output_dir is specified,
            the file will be saved in that directory.
        output_dir: Directory for relative paths. If None and save_path is relative,
            the current working directory is used.
        logger: Logger for logging progress. If None, print statements are used.

    Returns:
        Path: The resolved path where the dataset was saved.

    Raises:
        ValueError: If the file extension is not supported.
    """

    def _log(msg: str):
        # Route progress messages to the logger when available, stdout otherwise.
        if logger is None:
            print(msg)
        else:
            logger.info(msg)

    path = Path(save_path)

    # Resolve relative paths against output_dir when one is given.
    if output_dir is not None and not path.is_absolute():
        path = Path(output_dir) / path

    # Make sure the destination directory exists before writing.
    path.parent.mkdir(parents=True, exist_ok=True)

    ext = path.suffix.lower()
    _log(f"Saving to {ext[1:].upper()} format: {path}")

    # Deferred writers keyed by extension; nothing is called until selected.
    writers = {
        ".zarr": lambda: ds.to_zarr(path, mode="w"),
        ".nc": lambda: ds.to_netcdf(path),
    }
    write = writers.get(ext)
    if write is None:
        raise ValueError(f"Unsupported file extension: {ext}. Use '.zarr' or '.nc'.")
    write()

    _log(f"Dataset saved successfully to {path}")
    return path