Source code for plums.dataflow.dataset.playground

from warnings import warn

import numpy as np

from plums.commons.path import Path
from plums.commons.data import (
    TileCollection,
    Annotation,
    MaskCollection,
    VectorMask,
    RecordCollection,
    Record,
    Taxonomy,
    Label,
    DataPoint
)
from .pattern import PatternDataset
from ..utils.cache import DatasetCache
from ..io import Tile, RGB, load
from ..utils.path import PathResolver


class TileDriver:
    """A basic driver to open `Intelligence Playground`_ tiles as |TileIO| instance.

    It provides a basic level of customisation but heavy modification will require either subclassing and
    overriding or writing a new driver altogether.

    Args:
        *names (str): Optional. If provided, they will be used as keys in the |TileCollection| returned by the
            driver.
        ptype (|ptype|): Optional. Default to ``RGB``. The image pixel-type (e.g. RGB, BGR or Grey).
        dtype (:class:`~numpy.dtype`): Optional. Default to :class:`~numpy.uint8`. The internal
            :class:`~numpy.ndarray` storage data type.
        fetch_ordering (bool): Optional. Default to ``True``. If ``True``, tiles will be ordered using the
            information stored in the dataset summary provided as a *JSON* file alongside each exports.

            .. warning:: If ``False`` the |TileCollection| ordering will be entirely **filesystem dependent**
                which is no better than random.

    .. _Intelligence Playground: https://playground.intelligence-airbusds.com/

    """

    def __init__(self, *names, ptype=RGB, dtype=np.dtype('u1'), fetch_ordering=True):
        # Tile format configuration
        self._names = names
        self._explicit = bool(names)
        self._ptype = ptype
        self._dtype = dtype

        # Tile ordering configuration
        # +-> Summaries are resolved lazily on the first call, once a dataset path is known.
        self._summaries = None
        self._summary_resolver = None
        if fetch_ordering:
            self._summary_resolver = PathResolver('{dataset_id}/dataset_summary.json')

    @staticmethod
    def _make_order_index(path):
        """Construct a ``zone_id: (image_id, )`` mapping from a dataset summary.

        Args:
            path (Path): A path to a *JSON* dataset summary file.

        Returns:
            dict: A ``zone_id: (image_id, )`` mapping where ``(image_id, )`` is an **ordered** sequence of image
                identifiers, built by pairing the summary ``zoneIds`` and ``imageIds`` entries.

        """
        summary = load(path)
        return dict(zip(summary['zoneIds'], summary['imageIds']))

    def __call__(self, path_tuple, **matched_groups):
        """Open a set of tiles in a |TileCollection|.

        Args:
            path_tuple (Tuple[PathLike]): A tuple of paths pointing to the tiles to open.
            **matched_groups (str): A ``group_name: value`` mapping of the *path pattern* group match in the paths.

        Returns:
            |TileCollection|: A |TileCollection| with the opened tiles. If names were provided in the constructor,
                they are used as key in the collection, otherwise, the default applies.

        Raises:
            FileNotFoundError: If ordering was requested and no dataset summary could be found.
            ValueError: If ordering was requested and the dataset, zone or one of the images is missing from the
                summaries.
            ValueError: If the number of names provided in the constructor and the number of retrieved tiles
                mismatch.

        """
        # If need be resolve and load summaries
        if self._summaries is None and self._summary_resolver is not None:
            root = path_tuple[0].root_to_anchor(matched_groups['dataset_id'])
            summaries = {path.match['dataset_id']: self._make_order_index(path)
                         for path in self._summary_resolver.find(root)}

            if not summaries:
                raise FileNotFoundError('Invalid dataset: No file summaries could be found.')

            # Only remember the summaries once they are known to be usable: storing an empty mapping would make
            # every later call skip the lookup and fail with a misleading "missing zone" error instead.
            self._summaries = summaries

        # Reorder path tuple if need be
        if self._summaries is not None:
            dataset_id = matched_groups['dataset_id']
            zone_id = matched_groups['zone_id']

            try:
                order = {image_id: i for i, image_id in enumerate(self._summaries[dataset_id][zone_id])}
            except KeyError as err:
                raise ValueError('Invalid dataset: Some zones or datasets '
                                 'seem to be missing from the summaries.') from err

            try:
                # The second to last path element is used as the image identifier of a tile.
                path_tuple = tuple(sorted(path_tuple, key=lambda path: order[path[-2]]))
            except KeyError as err:
                raise ValueError('Invalid dataset: Some images seem to be missing from the summaries.') from err

        # Load tiles
        tiles = [Tile(path, ptype=self._ptype, dtype=self._dtype, **getattr(path, 'match', {}))
                 for path in path_tuple]

        if self._explicit:
            if len(tiles) != len(self._names):
                raise ValueError('The number of tiles is incompatible with the provided number '
                                 'of names: {} != {}.'.format(len(tiles), len(self._names)))
            return TileCollection(*zip(self._names, tiles))

        return TileCollection(*tiles)
class AnnotationDriver:
    """A basic driver to open `Intelligence Playground`_ annotation GeoJSON **FeatureCollection** as |Annotation|.

    It provides a basic level of customisation but heavy modification will require either subclassing and
    overriding or writing a new driver altogether.

    Args:
        record_id_key (str): The key used to find a record's unique identifier in its ``properties`` mapping.
        confidence_key (str): The key used to find a record's confidence score in its ``properties`` mapping.
        taxonomy (Taxonomy): If provided, a |Taxonomy| against which all records' labels will be validated.
        cache (bool): Optional. Default to ``False``. If ``True``, all constructed |Annotation| will be cached in
            memory to speed up future retrieval.

    .. _Intelligence Playground: https://playground.intelligence-airbusds.com/

    """

    # Property names dropped from every feature before the remaining properties are forwarded to |Record| as
    # keyword arguments (presumably to avoid clashing with the record's own arguments -- confirm against Record).
    _RESERVED_PROPERTIES = ('coordinates', 'labels', 'confidence', 'id', 'taxonomy')

    def __init__(self, record_id_key='record_id', confidence_key='confidence', taxonomy=None, cache=False):
        self.taxonomy = taxonomy
        self._record_id_key = record_id_key
        self._confidence_key = confidence_key
        self._cache = cache
        self._memcache = {}  # path_tuple -> Annotation, only filled when caching is enabled.

    @staticmethod
    def _cleanup(feature, key):
        """Remove ``key`` from the feature ``properties`` mapping, doing nothing if it is absent.

        Args:
            feature (dict): A GeoJSON feature with a ``properties`` mapping, modified in place.
            key (str): The property name to remove.

        """
        feature['properties'].pop(key, None)

    def __call__(self, path_tuple, **matched_groups):
        """Open a *Playground* annotation GeoJSON file as an |Annotation|.

        Args:
            path_tuple (Tuple[PathLike]): A tuple containing a single path pointing to a valid GeoJSON file.
            **matched_groups (str): A ``group_name: value`` mapping of the *path pattern* group match in the paths.

        Returns:
            |Annotation|: An |Annotation| with |Record| in the tile and a |VectorMask| corresponding to the *zone*
                footprint in the tile.

        Raises:
            ValueError: If no valid |Annotation| could be constructed from the opened JSON file.
            ValueError: If no path or more than one path was provided.

        """
        # Reject an empty tuple explicitly rather than failing later with an opaque IndexError.
        if not path_tuple:
            raise ValueError('No annotation file was provided.')
        if len(path_tuple) >= 2:
            raise ValueError('More than one annotation file was provided.')

        # If cache is enabled, try retrieving from cache
        if self._cache:
            annotation = self._memcache.get(path_tuple)
            if annotation is not None:
                return annotation

        # Load annotation
        feature_collection = load(path_tuple[0])

        # +-> Prepare defaults values (a degenerate footprint is used when the file declares no mask feature)
        data_mask = VectorMask([[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]], 'zone_footprint', mask=True)
        record_collection = RecordCollection(taxonomy=self.taxonomy)

        # +-> Iterate over features
        for feature in feature_collection['features']:
            properties = feature['properties']

            # +-> If found, retrieve zone footprint mask
            if properties.get('mask') is not None:
                data_mask = VectorMask(feature['geometry']['coordinates'], 'zone_footprint', mask=True)
                continue

            # +-> Get record coordinates
            coordinates = feature['geometry']['coordinates']
            # +-> Get record labels
            labels = properties.pop('tags')
            # +-> Get record confidence
            confidence = properties.pop(self._confidence_key, None)
            # +-> Get record identifier
            id_ = properties.pop(self._record_id_key, None)

            # Cleanup
            for key in self._RESERVED_PROPERTIES:
                self._cleanup(feature, key)

            record_collection.append(Record(coordinates, labels, confidence, id_, **properties))

        annotation = Annotation(record_collection,
                                mask_collection=MaskCollection(data_mask),
                                filename=path_tuple[0])

        # If cache is enabled, store annotation in cache
        if self._cache:
            self._memcache[path_tuple] = annotation

        return annotation
class TaxonomyReader:
    """A callable class which loads and constructs a |Taxonomy| when provided with a valid Playground dataset path."""

    def _make_tree(self, root, tree_descriptor):
        """Recursively attach the labels described by a nested dictionary to a root label.

        Args:
            root (Label): The label under which the described children are added.
            tree_descriptor (dict): A ``name: children`` mapping where ``children`` is either ``None`` or a
                mapping of the same shape describing the next level.

        Returns:
            Label: The ``root`` |Label| which was passed in.

        """
        for name, children in tree_descriptor.items():
            node = Label(name)
            root.add(node)
            if children is None:
                # Nothing is declared below this label.
                continue
            self._make_tree(node, children)
        return root

    def __call__(self, path):
        """Construct a |Taxonomy| from the exported dataset `taxonomy.json` file.

        Args:
            path (PathLike): A path to a single Playground dataset.

        Returns:
            Taxonomy: The dataset taxonomy.

        """
        descriptor = load(Path(path) / 'taxonomy.json')
        taxonomy = Taxonomy()
        self._make_tree(taxonomy.root, descriptor)
        return taxonomy
class PlaygroundDataset(PatternDataset):
    """A |Dataset| as exported by the `Intelligence Playground`_ which loads data in the *Plums* data model.

    A |PlaygroundDataset| has the following file structure:

    ::

        ├── <dataset_id_1>
        │   ├── samples
        │   │   ├── <zone_id_1>
        │   │   │   ├── <image_id_1>
        │   │   │   │   ├── <tile_id>.jpg
        │   │   │   │   └── ...
        │   │   │   ├── <image_id_2>
        │   │   │   │   ├── <tile_id>.jpg
        │   │   │   │   └── ...
        │   │   │   └── ...
        │   │   ├── <zone_id_2>
        │   │   │   ├── samples
        │   │   │   └── ...
        │   │   └── ...
        │   └── labels
        │       ├── <zone_id_1>
        │       │   ├── <tile_id>.json
        │       │   └── ...
        │       ├── <zone_id_2>
        │       │   ├── <tile_id>.json
        │       │   └── ...
        │       └── ...
        ├── <dataset_id_2>
        │   └── ...
        └── ...

    Where samples are projected jpg tiles of imagery and annotation are a *geojson* **FeatureCollection**. A
    ``taxonomy.json`` file is additionally read from each selected ``<dataset_id>`` directory.

    .. hint:: The constructor arguments allows for explicit selection of *datasets*, *zones*, *images* or *tiles*
        and explicit exclusion of *datasets*, *zones*, *images* or *tiles* by providing lists of identifiers to
        select or exclude. If no such sequence is provided, valid data point will be automatically discovered
        from the filesystem.

    Args:
        path (PathLike): The path to the dataset root, it may be either absolute or relative to the current
            working directory.
        select_datasets (Sequence[str]): Optional. If provided, it must be a sequence of uuid used to select the
            datasets in which data points will be fetched.
        exclude_datasets (Sequence[str]): Optional. If provided, it must be a sequence of uuid used to exclude
            datasets from the data point search.
        select_zones (Sequence[str]): Optional. If provided, it must be a sequence of uuid used to select the
            zones in which data points will be fetched.
        exclude_zones (Sequence[str]): Optional. If provided, it must be a sequence of uuid used to exclude
            zones from the data point search.
        select_images (Sequence[str]): Optional. If provided, it must be a sequence of identifiers used to select
            the images in which data points will be fetched.
        exclude_images (Sequence[str]): Optional. If provided, it must be a sequence of identifiers used to
            exclude images from the data point search.
        select_tiles (Sequence[str]): Optional. If provided, it must be a sequence of identifiers used to select
            the tiles which will be fetched.
        exclude_tiles (Sequence[str]): Optional. If provided, it must be a sequence of identifiers used to
            exclude tiles from the data point search.
        tile_driver (callable): Optional. Default to a |TileDriver|. A ``function(path_tuple, **matched_groups)``
            callable which return a |TileCollection|-like object called for each data point
            (see :ref:`drivers`).
        annotation_driver (callable): Optional. Default to a |AnnotationDriver|. A
            ``function(path_tuple, **matched_groups)`` callable which return an |Annotation|-like object called
            for each data point (see :ref:`drivers`).
        use_taxonomy (bool): Optional. Default to ``True``. If ``False``, the global taxonomy will not be passed
            to the annotation driver, leaving implicit taxonomies for each annotation file, with no interplay
            guarantee.
        strict (bool): If ``False``, solitary tiles or annotations will be silently dropped instead of raising.
        cache (bool): If ``True``, the dataset will be looked-up in the user's cache directory and if found loaded
            from there instead of walking the file-system. Note that although this could speedup dataset loading
            multiple fold for big datasets, one may load stale data when using the cache.

    Warnings:
        If providing a custom annotation driver, the ``use_taxonomy`` flag is not guaranteed to work and it is up
        to the provided driver to handle dataset taxonomies if needed (See also the |TaxonomyReader| helper
        class).

    Raises:
        FileNotFoundError: If a selected dataset has no ``taxonomy.json`` file.
        ValueError: If the requested playground datasets have mismatching taxonomies and global |Taxonomy| usage
            was requested.
        ValueError: If tile could not be matched to an annotation and ``strict`` is ``True``.
        ValueError: If no tile/annotation pair could be found.

    Warns:
        UserWarning: If the requested playground datasets have mismatching taxonomies and global |Taxonomy| usage
            was not requested.

    .. _Intelligence Playground: https://playground.intelligence-airbusds.com/

    """

    _cache = DatasetCache('playground')  # For easy subclass prefix selection.

    def __init__(self, path,
                 select_datasets=(), select_zones=(), select_images=(), select_tiles=(),
                 exclude_datasets=(), exclude_zones=(), exclude_images=(), exclude_tiles=(),
                 tile_driver=None, annotation_driver=None, use_taxonomy=True, strict=True, cache=False):

        def _selection_regex(default, select, exclude):
            """Build a group regex from a default pattern and optional select and exclude identifier sequences."""
            # An explicit selection replaces the default pattern with an alternation of the identifiers.
            regex = r'(?:{})'.format('|'.join(select)) if select else default
            # Exclusions are prepended as a negative lookahead on the excluded identifiers.
            if exclude:
                regex = r'(?!{}){}'.format('|'.join(exclude), regex)
            return regex

        # Build eventual regular expressions from include and exclude sequences
        uuid_regex = r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}'
        dataset_regex = _selection_regex(uuid_regex, select_datasets, exclude_datasets)
        zone_regex = _selection_regex(uuid_regex, select_zones, exclude_zones)
        image_regex = _selection_regex(r'[^/]+', select_images, exclude_images)
        tile_regex = _selection_regex(r'[0-9a-f]{32}', select_tiles, exclude_tiles)

        # Build path patterns
        # +-> Tile:
        tile_pattern = ('{{dataset_id:{dataset_regex}}}/'
                        'samples/'
                        '{{zone_id:{zone_regex}}}/'
                        '{{image_id:{image_regex}}}/'
                        '{{tile_id:{tile_regex}}}.jpg').format(dataset_regex=dataset_regex,
                                                               zone_regex=zone_regex,
                                                               image_regex=image_regex,
                                                               tile_regex=tile_regex)
        # +-> Annotation:
        annotation_pattern = ('{{dataset_id:{dataset_regex}}}/'
                              'labels/'
                              '{{zone_id:{zone_regex}}}/'
                              '{{tile_id:{tile_regex}}}.json').format(dataset_regex=dataset_regex,
                                                                      zone_regex=zone_regex,
                                                                      tile_regex=tile_regex)

        # Initialize dataset
        super(PlaygroundDataset, self).__init__(
            tile_pattern=tile_pattern,
            annotation_pattern=annotation_pattern,
            tile_driver=TileDriver() if tile_driver is None else tile_driver,
            annotation_driver=AnnotationDriver() if annotation_driver is None else annotation_driver,
            path=path,
            strict=strict,
            sort_key=lambda group: group,
            cache=cache)

        # Load taxonomies and attach it to the annotation driver if needed
        path = Path(path)
        taxonomy_reader = TaxonomyReader()
        reference_taxonomy = None
        reference_dataset = None

        # Dataset identifiers are visited in sorted order so that the reference dataset, and thus the content of
        # the mismatch error or warning, does not depend on set iteration order.
        for dataset_id in sorted({group[0] for group in self._group_index}):
            try:
                taxonomy = taxonomy_reader(path / dataset_id)
            except FileNotFoundError as err:
                raise FileNotFoundError(
                    'Invalid dataset: No taxonomy could be found in {}.'.format(path / dataset_id)) from err

            # The first dataset read serves as the reference every other dataset is compared against.
            if reference_taxonomy is None:
                reference_dataset = dataset_id
                reference_taxonomy = taxonomy

            if reference_taxonomy != taxonomy:
                message = ('Some datasets have mismatching taxonomies:\n'
                           'Dataset {}:\n'
                           '{}\n'
                           'is different from\n'
                           'Dataset {}:\n'
                           '{}').format(reference_dataset, reference_taxonomy, dataset_id, taxonomy)
                if use_taxonomy:
                    raise ValueError(message)
                warn(message, UserWarning)

        if use_taxonomy:
            self._annotation_driver.taxonomy = reference_taxonomy

    def __getitem__(self, item):
        """Read and return the i-th |DataPoint| of the |PlaygroundDataset|.

        Args:
            item (int): The |DataPoint| index in the dataset.

        Returns:
            DataPoint: The dataset i-th entry, with its ``dataset_id``, ``zone_id`` and ``tile_id`` attributes set
                from the matched path groups.

        """
        # Fetch match
        group = self._group_index[item]
        match = dict(zip(self._matching_groups, group))

        # Fetch data point
        data_point = super(PlaygroundDataset, self).__getitem__(item)
        data_point.dataset_id = match['dataset_id']
        data_point.zone_id = match['zone_id']
        data_point.tile_id = match['tile_id']

        return data_point