Source code for watershed_workflow.sources.manager_soilgrids_2017

"""Manager for downloading SoilGrids250m-2017 products."""

from typing import List
import os
import logging
import numpy as np
import shapely
import rasterio
import rasterio.mask
import xarray as xr
import rioxarray
import cftime

import watershed_workflow.warp
import watershed_workflow.crs
from watershed_workflow.crs import CRS

from . import utils as source_utils
from . import filenames
from . import manager_dataset



[docs]
class ManagerSoilGrids2017(manager_dataset.ManagerDataset):
    """SoilGrids 250m (2017) datasets.

    SoilGrids 2017 maintains, to date, the only complete
    characterization of all soil properties needed for a hydrologic
    model.  The resolution is decent, and the accuracy is ok, but most
    importantly it is complete.

    .. [SoilGrids2017] https://www.isric.org/explore/soilgrids/faq-soilgrids-2017

    .. [hengl2014soilgrids] Hengl, Tomislav, et al. "SoilGrids1km—global soil information based on automated mapping." PloS one 9.8 (2014): e105992.

    .. [hengl2017soilgrids] Hengl, Tomislav, et al. "SoilGrids250m: Global gridded soil information based on machine learning." PLoS one 12.2 (2017): e0169748.
    
    See the above link for a complete listing of potential variable
    names; included here are a subset used by this code.  That said,
    any 2017 filename can be used with this source manager.

    Variables available with layer information:
    
    - BLDFIE_layer_1 through BLDFIE_layer_7: Bulk density of fine earth [kg m^-3]
    - CLYPPT_layer_1 through CLYPPT_layer_7: Percent clay [%]
    - SLTPPT_layer_1 through SLTPPT_layer_7: Percent silt [%] 
    - SNDPPT_layer_1 through SNDPPT_layer_7: Percent sand [%]
    - WWP_layer_1 through WWP_layer_7: Soil water capacity % at wilting point [%]
    - BDTICM: Absolute depth to continuous, unfractured bedrock [cm]
    """
    URL = "https://files.isric.org/soilgrids/former/2017-03-10/data/"
    DEPTHS = [0, 0.05, 0.15, 0.3, 0.6, 1.0, 2.0]
    BASE_VARIABLES = ['BLDFIE', 'CLYPPT', 'SLTPPT', 'SNDPPT', 'WWP']
    BEDROCK_VARIABLE = 'BDTICM'
    LAYERS = list(range(1, 8))  # 1 through 7

    def __init__(self, variant: str | None = None):
        # Create all valid variable combinations
        valid_variables = [self.BEDROCK_VARIABLE]  # BDTICM has no layers
        for base_var in self.BASE_VARIABLES:
            for layer in self.LAYERS:
                valid_variables.append(f'{base_var}_layer_{layer}')
        
        # Set up names helper
        if variant == 'US':
            name = 'SoilGrids2017_US'
            self.names = filenames.Names(
                name, 'soil_structure', name, '{variable}_M_{soillevel}250m_ll_us.tif')
        else:
            name = 'SoilGrids2017'
            self.names = filenames.Names(
                name, 'soil_structure', name, '{variable}_M_{soillevel}250m_ll.tif')
        
        # Initialize ManagerDataset
        super().__init__(
            name=name,
            source=self.URL,
            native_resolution=250.0 / 111320.0,  # 250m converted to degrees (approx)
            native_crs_in=watershed_workflow.crs.from_epsg(4326),
            native_crs_out=watershed_workflow.crs.from_epsg(4326),
            native_start=None,
            native_end=None,
            valid_variables=valid_variables,
            default_variables=[self.BEDROCK_VARIABLE]
        )

    def _requestDataset(self, request: manager_dataset.ManagerDataset.Request
                        ) -> manager_dataset.ManagerDataset.Request:
        """Download all required files for the request.
        
        Parameters
        ----------
        request : ManagerDataset.Request
            The dataset request
            
        Returns
        -------
        ManagerDataset.Request
            The request with is_ready set to True
        """
        # Download all required files
        for var_name in request.variables:
            base_var, layer = self._parseVariable(var_name)
            self._downloadFile(base_var, layer)
        
        # Set ready since downloads are immediate
        request.is_ready = True
        return request
    
    def _fetchDataset(self, request: manager_dataset.ManagerDataset.Request) -> xr.Dataset:
        """Fetch the dataset for the request.
        
        Parameters
        ----------
        request : ManagerDataset.Request
            The dataset request
            
        Returns
        -------
        xr.Dataset
            Dataset containing the requested variables
        """
        data_arrays = {}
        bounds = request.geometry.bounds
        
        for var_name in request.variables:
            base_var, layer = self._parseVariable(var_name)
            
            # Get filename using the names helper
            if layer is None:
                soillevel = ''
            else:
                soillevel = f'sl{layer}_'
            filename = self.names.file_name(variable=base_var, soillevel=soillevel)
            
            # Load raster and clip to bounds
            dataset = rioxarray.open_rasterio(filename, cache=False)
            dataset = dataset.rio.clip_box(*bounds, crs=watershed_workflow.crs.to_rasterio(self.native_crs_in))
            
            # Convert to DataArray (remove band dimension if single band)
            if len(dataset.shape) > 2:
                da = dataset[0, :, :]  # Take first band
            else:
                da = dataset
            
            # Set variable name
            da.name = var_name
            
            # Add units and description as attributes
            if base_var == 'BDTICM':
                da.attrs['units'] = 'cm'
                da.attrs['description'] = 'Absolute depth to continuous, unfractured bedrock'
            elif base_var == 'BLDFIE':
                da.attrs['units'] = 'kg m^-3'
                da.attrs['description'] = 'Bulk density of fine earth'
            elif base_var in ['CLYPPT', 'SLTPPT', 'SNDPPT', 'WWP']:
                da.attrs['units'] = '%'
                if base_var == 'CLYPPT':
                    da.attrs['description'] = 'Percent clay'
                elif base_var == 'SLTPPT':
                    da.attrs['description'] = 'Percent silt'
                elif base_var == 'SNDPPT':
                    da.attrs['description'] = 'Percent sand'
                elif base_var == 'WWP':
                    da.attrs['description'] = 'Soil water capacity % at wilting point'
            
            if layer is not None:
                da.attrs['layer'] = layer
            
            data_arrays[var_name] = da
        
        # Create Dataset
        result_dataset = xr.Dataset(data_arrays)
        
        return result_dataset

    def _parseVariable(self, var_name: str) -> tuple[str, int | None]:
        """Parse a variable name to extract base variable and layer.
        
        Parameters
        ----------
        var_name : str
            Variable name (e.g., 'BLDFIE_layer_3' or 'BDTICM')
            
        Returns
        -------
        tuple[str, int | None]
            Base variable name and layer number (None for BDTICM)
        """
        if var_name == self.BEDROCK_VARIABLE:
            return var_name, None
        
        if '_layer_' in var_name:
            parts = var_name.split('_layer_')
            if len(parts) == 2 and parts[0] in self.BASE_VARIABLES:
                try:
                    layer = int(parts[1])
                    if layer in self.LAYERS:
                        return parts[0], layer
                except ValueError:
                    pass
        
        raise ValueError(f"Invalid variable name: {var_name}")

    def _downloadFile(self, base_variable: str, layer: int | None, force: bool = False) -> str:
        """Download a file if it doesn't exist.
        
        Parameters
        ----------
        base_variable : str
            Base variable name
        layer : int | None
            Layer number (None for bedrock)
        force : bool
            Force re-download if True
            
        Returns
        -------
        str
            Path to the downloaded file
        """
        os.makedirs(self.names.folder_name(), exist_ok=True)
        
        if layer is None:
            soillevel = ''
        else:
            soillevel = f'sl{layer}_'
        
        filename = self.names.file_name(variable=base_variable, soillevel=soillevel)
        
        if not os.path.exists(filename) or force:
            filename_base = self.names.file_name_base(variable=base_variable, soillevel=soillevel)
            url = self.URL + filename_base
            source_utils.download(url, filename, force)
        
        return filename