'''
Dataset quality control utilities
'''
from typing import Iterable, Union
import warnings
from pathlib import Path
import rasterio as rio
from shapely import geometry as g
import geopandas as gp
import requests
import numpy as np
from operator import add
from functools import reduce
from .parallel import blocks
from .datasets.catalogue import _Resource
_LANDMASK_REF = 'https://s3.eu-central-1.wasabisys.com/eumap/lcv/lcv_land.mask_pflugmacher2019.landcover.12_f_30m_s0..0m_2014..2016_eumap_epsg3035_v0.1.tif'
def _test_field_nonempty(val):
if isinstance(val, str):
val = val.strip()
return val not in ['', []]
[docs]class Test:
"""
Class for performing QC against GH datasets.
:param bounds: Iterable of bounding coordinates to check quality within, if ``None`` the bounds of the raster will be used
:param verbose: Verbosity of the checks
:param crs: CRS of the bounding coordinates (defaults to the raster coordinates)
Examples
========
>>> from eumap.qc import Test
>>>
>>> bounds = (4751935.0, 2420238.0, 4772117.0, 2444223.0)
>>> test = Test(bounds, verbose=True)
>>> dataset_url = 'https://s3.eu-central-1.wasabisys.com/eumap/lcv/lcv_landcover.hcl_lucas.corine.rf_p_30m_0..0cm_2019_eumap_epsg3035_v0.1.tif'
>>>
>>> available = test.availability(dataset_url)
>>> coverage_fraction = test.raster_land_coverage(dataset_url, include_ice=True, include_wetlands=True)
"""
def __init__(self,
bounds: Iterable,
crs: bool=None,
verbose: bool=False,
):
self.bounds = bounds
self.verbose = verbose
self.crs = crs
[docs] def accessibility(self,
dataset_url: str,
) -> bool:
"""
Check if a remote resource is available.
:param dataset_url: URL of remote resource
:returns: True if dataset is accessible, otherwise False
"""
resp = requests.get(dataset_url, stream=True)
result = resp.status_code < 400
del resp
if self.verbose:
if result:
acc = 'accessible'
else:
acc = 'inaccessible'
print(
f'Dataset {acc}:',
dataset_url,
sep='\n',
)
return result
[docs] def raster_land_coverage(self,
dataset_path: Union[str, Path],
include_ice: bool=False,
include_wetlands: bool=False,
) -> float:
"""
Check completeness of a remote raster resource against land mask derived from [1].
:param dataset_path: Path or URL to raster dataset
:param include_ice: Include snow and ice in landmask
:param include_wetlands: Include wetlands in landmask
:returns: Fraction of pixels which are not nodata accross the landmask
References
==========
[1] `Pan-European land cover map 2015 (Pflugmacher et al., 2019) <https://doi.pangaea.de/10.1594/PANGAEA.896282>`_
"""
with rio.open(dataset_path) as src:
nodata = src.nodata
def _get_counts(landmask, data):
idx_landmask = landmask < 10
if include_ice:
idx_landmask = idx_landmask | (landmask == 12)
if include_wetlands:
idx_landmask = idx_landmask | (landmask == 11)
idx_data = (data != nodata) & idx_landmask
return np.array([
idx_data.sum(),
idx_landmask.sum(),
])
reader = blocks.RasterBlockReader(_LANDMASK_REF)
agg = blocks.RasterBlockAggregator(reader)
_crs = self.crs
if _crs is None:
_crs = reader.reference.crs
_gdf = gp.GeoDataFrame(
geometry=[g.box(*self.bounds)],
crs=_crs,
)
if _gdf.crs != reader.reference.crs:
_gdf = _gdf.to_crs(reader.reference.crs)
_geom = g.mapping(_gdf.geometry[0])
counts = agg.aggregate(
[_LANDMASK_REF, dataset_path], _geom,
block_func=_get_counts,
agg_func=lambda results: reduce(add, results),
)
result = counts[0] / counts[1]
if self.verbose:
print(
f'Completeness {result*100}% for dataset:',
dataset_path,
sep='\n',
)
return result
def metadata_consistency(self,
resource:_Resource,
) -> dict:
META_KEYS = (
'title',
'abstract',
'theme',
'authors',
)
result = []
for meta_key in META_KEYS:
try:
meta_val = resource.meta[meta_key]
assert _test_field_nonempty(meta_val)
result.append(True)
except (KeyError, AssertionError):
result.append(False)
if self.verbose:
print('Missing metadata:', meta_key)
if self.verbose:
print('All metadata present:', all(result))
return dict(zip(META_KEYS, result))