# Source code for plums.plot.engine.descriptor

import re
import abc

import schema
import numpy as np
from ordered_set import OrderedSet

from plums.plot.engine.utils import dict_equal
from plums.plot.engine.color import Color
from plums.plot.engine.color.color_map import ColorMap, identity

_camel_to_snake_re1 = re.compile('(.)([A-Z][a-z]+)')
_camel_to_snake_re2 = re.compile('([a-z0-9])([A-Z])')


def _camel_to_snake(camel_cased_name):
    s1 = _camel_to_snake_re1.sub(r'\1_\2', camel_cased_name)
    return _camel_to_snake_re2.sub(r'\1_\2', s1).lower()


# Validation schema for the ``schema`` entry of a *continuous* descriptor interface:
# either a two-item tuple of ``int``/``float`` values (a ``(start, end)`` range) or a ColorMap.
# It is also reused below as one of the accepted category values.
_continuous_schema = schema.Schema(
    schema.Or(
        schema.And((schema.Or(int, float), ), lambda x: len(x) == 2),
        ColorMap
    )
)

# Validation schema for the ``schema`` entry of a *categorical* descriptor interface:
# a dict keyed by category name (``str``) whose values are a number, a Color, a continuous
# range description, or one nested level of such values keyed by ``str``.
# NOTE(review): the trailing ``{}`` alternative presumably lets an empty mapping validate
# (e.g. a descriptor with no known category yet) -- confirm against the `schema` library semantics.
_categorical_schema = schema.Schema(
    schema.Or(
        {
            str: schema.Or(int,
                           float,
                           Color,
                           _continuous_schema,
                           {
                               str: schema.Or(int, float, Color, _continuous_schema)
                           })
        }, {})
)


class Descriptor(object, metaclass=abc.ABCMeta):
    """Abstract base class for all |Descriptor|.

    A |Descriptor| is a class used to extract relevant information from a collection of |Record|.

    More specifically, a |Descriptor| is used in two phases:

    * In a first time, optional internals are updated and/or accumulated over the entire collection of
      |RecordCollection| tuples. Those internals will be used to tailor the |Descriptor| to the description
      task specific context (*e.g.* like updating a range of possible value to later normalize or clip a
      property value).
    * In a second time, the descriptor is used to construct a new |RecordCollection| or a tuple of
      |RecordCollection| where each enclosed |Record| is added a named property containing the adequate
      description of the |Record| from a tuple of |RecordCollection|.

    Subclasses must override the two interfaces used to do so:
    :meth:`update(*record_collections) <update>` and :meth:`compute(*record_collections) <compute>` as well as
    the :meth:`reset` method and the :meth:`_make_interface` private method.

    Args:
        name (str): The |Descriptor| name.

    Attributes:
        name (str): The |Descriptor| name.

    """

    # Schema every ``__descriptor__`` dictionary must satisfy: one alternative per descriptor type.
    __interface_schema__ = schema.Schema(
        schema.Or(
            {
                'name': schema.Regex(r'^[(),a-zA-Z_ ]+$'),
                'type': 'categorical',
                'property': schema.Regex(r'^[a-z][a-z_]+[a-z]$'),
                'schema': _categorical_schema
            },
            {
                'name': schema.Regex(r'^[(),a-zA-Z_ ]+$'),
                'type': 'continuous',
                'property': schema.Regex(r'^[a-z][a-z_]+[a-z]$'),
                'schema': _continuous_schema
            }
        )
    )

    # Defining ``__eq__`` already makes instances unhashable; state it explicitly.
    __hash__ = None

    def __init__(self, name):
        self.name = name

    @property
    def property_name(self):
        """str: The |Record| inserted property name after :meth:`compute` call."""
        return '_'.join((_camel_to_snake(self.__class__.__name__), _camel_to_snake(self.name)))

    @property
    def __descriptor__(self):  # noqa: D401
        """dict: A dictionary which summarizes the internal state of the |Descriptor| instance.

        Keys:
            name (str): The descriptor name. Note that in some |Descriptor|, it conjointly designates the
                descriptor name **and** the name of the property read from each |Record| to describe it.
            property (str): The name of the property written out by the |Descriptor| in each |Record|.
            type (str): Either ``categorical`` or ``continuous``.
            schema (dict, tuple, ColorMap): For ``categorical`` descriptors, it must be a dict mapping each
                ``category name`` to its relevant value (*e.g.* its index or |Color|).
                For ``continuous`` descriptors, it must be a range description (*i.e.* either a
                ``(start, end)`` tuple or an object with a ``range`` attribute like a |ColorMap|).

        Raises:
            ValueError: If the dictionary built from :meth:`_make_interface` does not validate against
                :attr:`__interface_schema__`.

        See Also:
            For more information on how :attr:`__descriptor__` is constructed, please see :ref:`descriptor`.

        """
        interface = self._make_interface()
        # The displayed name is the title-cased descriptor name with underscores turned into spaces.
        interface.update({'name': self.name.title().replace('_', ' '), 'property': self.property_name})

        try:
            self.__interface_schema__.validate(interface)
        except schema.SchemaError as e:
            raise ValueError('Invalid __descriptor__ returned:\n'
                             '{}\n{}'.format(e.errors, e.autos)) from e

        return interface

    def __repr__(self):
        """Return a python representation of the |Descriptor|."""
        return '{}(name={})'.format(self.__class__.__name__, self.name)

    def __str__(self):
        """Return the |Descriptor| representation."""
        return self.__repr__()

    def __eq__(self, other):
        """Return ``True`` if two |Descriptor| have the same attributes and class."""
        if self.__class__ == other.__class__:
            return dict_equal(self.__dict__, other.__dict__)
        return False

    def __ne__(self, other):
        """Return ``True`` if two |Descriptor| do not have the same attributes and class."""
        return not self == other

    @abc.abstractmethod
    def update(self, *record_collections):
        """Update internal values from |Record| descriptions.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| of which |Record| will be used to
                update internals.

        """
        raise NotImplementedError

    @abc.abstractmethod
    def compute(self, *record_collections):
        """Construct new |RecordCollection| where each enclosed |Record| is added a named description property.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| used to construct new
                |RecordCollection| with described |Record|.

        Returns:
            (|RecordCollection|, ): A described |RecordCollection| tuple.

        """
        raise NotImplementedError

    @abc.abstractmethod
    def reset(self):
        """Reset |Descriptor| internals to factory values."""
        raise NotImplementedError

    @abc.abstractmethod
    def _make_interface(self):
        """Build a summary of the internal state of the |Descriptor|.

        It is used as a unified interface for all descriptors to describe and explicit their description.

        As an example, for a typical |CategoricalDescriptor|, the interface dictionary would contain:

        * The |Descriptor| name
        * The |Descriptor| property name
        * The |Descriptor| type, *i.e.* ``'categorical'`` here
        * The category name to category index mapping.

        As for a typical |ContinuousDescriptor|, it would contain:

        * The |Descriptor| name
        * The |Descriptor| property name
        * The |Descriptor| type, *i.e.* ``'continuous'`` here
        * The value range.

        See Also:
            The :attr:`__descriptor__` for more information.

        Returns:
            dict: A valid |Descriptor| :attr:`__descriptor__`.

        """
        raise NotImplementedError
class CategoricalDescriptor(Descriptor):
    """A |Descriptor| used to extract a categorical property from a collection of |Record|.

    Args:
        name (str): The |Record| property to describe name.
        fetch_fn (Callable): Optional. Default to identity. An optional function applied to the
            |Record| property before it is counted in a category.

    Attributes:
        name (str): The |Record| property to describe name.

    """

    def __init__(self, name, fetch_fn=None):
        super(CategoricalDescriptor, self).__init__(name)
        self._categories = OrderedSet()
        self._fetch_fn = fetch_fn or identity

    def update(self, *record_collections):
        """Update the set of known categories from |Record| property :attr:`name` value.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| of which |Record| will be used to
                update set of known categories.

        Raises:
            ValueError: If an :class:`AttributeError` is raised while reading the property from a |Record|.

        """
        records = (record for record_collection in record_collections for record in record_collection)
        try:
            for record in records:
                # Categories are always stored as strings, after the optional fetch function is applied.
                self._categories.add(str(self._fetch_fn(getattr(record, self.name))))
        except AttributeError as e:
            raise ValueError('Invalid record property name: {} was not found in record.'.format(self.name)) from e

    def compute(self, *record_collections):
        """Construct new |RecordCollection| where each enclosed |Record| is added a category number as a property.

        The value written under :attr:`property_name` is the category index offset by ``0.5``.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| used to construct new
                |RecordCollection| with described |Record|.

        Returns:
            (|RecordCollection|, ): A described |RecordCollection| tuple.

        Raises:
            ValueError: If the property can not be read from a |Record| or if its value is not a known category.

        """
        for record_collection in record_collections:
            for record in record_collection:
                try:
                    record.properties[self.property_name] = \
                        self._categories.index(str(self._fetch_fn(getattr(record, self.name)))) + 0.5
                except AttributeError as e:
                    raise ValueError('Invalid record property name: '
                                     '{} was not found in record.'.format(self.name)) from e
                except KeyError as e:
                    raise ValueError('Invalid record property value: '
                                     '{} is out of known property range of values.'
                                     .format(getattr(record, self.name))) from e
        return record_collections

    def reset(self):
        """Reset |CategoricalDescriptor| set of known categories to factory values."""
        self._categories = OrderedSet()

    def _make_interface(self):
        """Return the ``type`` and ``schema`` entries of the categorical :attr:`__descriptor__`."""
        return {
            'type': 'categorical',
            'schema': {category: index + 0.5 for index, category in enumerate(self._categories)}
        }


def _initial_range(data_range):
    """Return the ``(start, end)`` pair a range-tracking descriptor starts from.

    Args:
        data_range (tuple): A ``(start, end)`` pair, or a falsy value to request the default.

    Returns:
        tuple: ``(data_range[0], data_range[1])`` if `data_range` is truthy, ``(+Inf, -Inf)`` otherwise.

    """
    if data_range:
        return data_range[0], data_range[1]
    # Inverted infinite bounds: the first observed value replaces both through min/max.
    return float('Inf'), -float('Inf')


class ContinuousDescriptor(Descriptor):
    """A |Descriptor| used to extract a continuous property from a collection of |Record| with an optional scaling.

    Args:
        name (str): The |Record| property to describe name.
        scale (tuple): Optional. Default to ``None``. If given, it must be an interval :math:`[a, b]`
            over which the extracted property will be scaled.
        data_range (tuple): Optional. Default to ``None``. If provided, it is used to initialise internal
            start and end values instead of ``(+Inf, -Inf)``.
        fetch_fn (Callable): Optional. Default to identity. An optional function applied to the
            |Record| property before it is used.

    Attributes:
        name (str): The |Record| property to describe name.
        scale (tuple): An interval :math:`[a, b]` over which the extracted property will be scaled.

    """

    def __init__(self, name, scale=None, data_range=None, fetch_fn=None):
        super(ContinuousDescriptor, self).__init__(name)
        self._start, self._end = _initial_range(data_range)
        self._data_range = data_range
        self.scale = scale
        self._fetch_fn = fetch_fn or identity

    def update(self, *record_collections):
        """Update the known descriptor range from |Record| property :attr:`name` value.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| of which |Record| will be used to
                update the known range.

        Raises:
            ValueError: If an :class:`AttributeError` is raised while reading the property from a |Record|.

        """
        records = (record for record_collection in record_collections for record in record_collection)
        try:
            for record in records:
                property_ = self._fetch_fn(getattr(record, self.name))
                self._start = min(self._start, property_)
                self._end = max(self._end, property_)
        except AttributeError as e:
            raise ValueError('Invalid record property name: {} was not found in record.'.format(self.name)) from e

    def _rescale(self, value):
        """Linearly map `value` from the known range onto :attr:`scale`.

        A degenerate known range (``start == end``) has no defined linear mapping; the lower bound of
        :attr:`scale` is returned in that case instead of dividing by zero.
        """
        low, high = self.scale[0], self.scale[1]
        span = self._end - self._start
        if span == 0:
            return float(low)
        return ((float(value) - self._start) / span) * (high - low) + low

    def compute(self, *record_collections):
        """Construct new |RecordCollection| where each enclosed |Record| is added the descriptor value as a property.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| used to construct new
                |RecordCollection| with described |Record|.

        Returns:
            (|RecordCollection|, ): A described |RecordCollection| tuple.

        Raises:
            ValueError: If the property can not be read from a |Record| or if its value lies outside of the
                known range.

        """
        for record_collection in record_collections:
            for record in record_collection:
                try:
                    property_ = self._fetch_fn(getattr(record, self.name))
                    if property_ > self._end or property_ < self._start:
                        raise ValueError('Invalid record property value: '
                                         '{} is out of known property range '
                                         '[{}, {}].'.format(getattr(record, self.name), self._start, self._end))
                    if self.scale:
                        property_ = self._rescale(property_)
                    record.properties[self.property_name] = property_
                except AttributeError as e:
                    raise ValueError('Invalid record property name: '
                                     '{} was not found in record.'.format(self.name)) from e
        return record_collections

    def reset(self):
        """Reset |Descriptor| known descriptor range to factory values."""
        self._start, self._end = _initial_range(self._data_range)

    def _make_interface(self):
        """Return the ``type`` and ``schema`` entries of the continuous :attr:`__descriptor__`."""
        return {
            'type': 'continuous',
            'schema': (self._start, self._end)
        }


class IntervalDescriptor(CategoricalDescriptor):
    """A |CategoricalDescriptor| used to extract a continuous property from a collection of |Record| into a category.

    Args:
        name (str): The |Record| property to describe name.
        n (int): Optional. Default to ``5``. The number of bins used to categorise the continuous property.
        data_range (tuple): Optional. Default to ``None``. If provided, it is used to initialise internal
            start and end values instead of ``(+Inf, -Inf)``.
        fetch_fn (Callable): Optional. Default to identity. An optional function applied to the
            |Record| property before it is binned.

    Attributes:
        name (str): The |Record| property to describe name.
        n (int): The number of bins used to categorise the continuous property.

    """

    def __init__(self, name, n=5, data_range=None, fetch_fn=None):
        # Deliberately skip CategoricalDescriptor.__init__: it assigns ``self._categories``,
        # which is a read-only property on this class.
        super(CategoricalDescriptor, self).__init__(name)
        self.n = n
        self._start, self._end = _initial_range(data_range)
        self._data_range = data_range
        self._fetch_fn = fetch_fn or identity

    @property
    def _range(self):
        """Width of the known value range."""
        return self._end - self._start

    @property
    def _step(self):
        """Width of one bin, rounded to 8 decimals and never smaller than ``1e-10``."""
        return max(round(self._range / float(self.n), 8), 1e-10)

    def _get_interval_bound(self, i):
        """Return the `i`-th bin boundary; boundary ``n`` is the end of the known range.

        Raises:
            IndexError: If `i` is greater than :attr:`n`.

        """
        if i < self.n:
            return self._start + i * self._step
        elif i == self.n:
            return self._end
        else:
            raise IndexError('Out of bound index {} to get interval bound value.'.format(i))

    @property
    def _categories(self):
        """Tuple of the :attr:`n` bin labels, each formatted as ``'[low, high['`` with two decimals."""
        return tuple('[{:.2f}, {:.2f}['.format(self._get_interval_bound(i), self._get_interval_bound(i + 1))
                     for i in range(self.n))

    def update(self, *record_collections):
        """Update the known descriptor range from |Record| property :attr:`name` value.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| of which |Record| will be used to
                update the known range.

        Raises:
            ValueError: If an :class:`AttributeError` is raised while reading the property from a |Record|.

        """
        records = (record for record_collection in record_collections for record in record_collection)
        try:
            for record in records:
                property_ = self._fetch_fn(getattr(record, self.name))
                self._start = min(self._start, property_)
                # The upper bound is pushed 1e-8 past the observed value, presumably so that the
                # maximum stays strictly inside the last half-open interval.
                self._end = max(self._end, property_ + 1e-8)
        except AttributeError as e:
            raise ValueError('Invalid record property name: {} was not found in record.'.format(self.name)) from e

    def compute(self, *record_collections):
        """Construct new |RecordCollection| where each enclosed |Record| is added a category number as a property.

        The value written under :attr:`property_name` is the bin index (clamped to the last bin) offset by ``0.5``.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| used to construct new
                |RecordCollection| with described |Record|.

        Returns:
            (|RecordCollection|, ): A described |RecordCollection| tuple.

        Raises:
            ValueError: If the property can not be read from a |Record| or if its value lies outside of the
                known range.

        """
        for record_collection in record_collections:
            for record in record_collection:
                try:
                    property_ = self._fetch_fn(getattr(record, self.name))
                    if property_ > self._end or property_ < self._start:
                        raise ValueError('Invalid record property value: '
                                         '{} is out of known property range '
                                         '[{}, {}].'.format(getattr(record, self.name), self._start, self._end))
                    # There are exactly ``n`` categories: no need to rebuild every label to count them.
                    record.properties[self.property_name] = \
                        min(int((property_ - self._start) // self._step), self.n - 1) + 0.5
                except AttributeError as e:
                    raise ValueError('Invalid record property name: '
                                     '{} was not found in record.'.format(self.name)) from e
        return record_collections

    def reset(self):
        """Reset |Descriptor| known descriptor range to factory values."""
        self._start, self._end = _initial_range(self._data_range)


class Labels(CategoricalDescriptor):
    """Extract a categorical property from :attr:`~plums.commons.data.record.Record.labels`."""

    def __init__(self):
        super(Labels, self).__init__(name='labels', fetch_fn=self._fetch_fn)

    @staticmethod
    def _fetch_fn(labels):
        """Join the string form of every label with ``', '`` to build a single category name."""
        return ', '.join([str(e) for e in labels])


class Confidence(ContinuousDescriptor):
    """Extract a continuous property from :attr:`~plums.commons.data.record.Record.confidence`.

    The known range is initialised to ``(0, 1)``.

    Args:
        scale (tuple): Optional. Default to ``None``. If given, it must be an interval :math:`[a, b]`
            over which the extracted property will be scaled.

    """

    def __init__(self, scale=None):
        super(Confidence, self).__init__(name='confidence', scale=scale, data_range=(0, 1))


class IntervalConfidence(IntervalDescriptor):
    """Extract a interval property from :attr:`~plums.commons.data.record.Record.confidence`.

    The known range is initialised to ``(0, 1)``.

    Args:
        n (int): Optional. Default to ``5``. The number of bins used to categorise the continuous property.

    """

    def __init__(self, n=5):
        super(IntervalConfidence, self).__init__(name='confidence', data_range=(0, 1), n=n)


def _shoelace_formula(x, y):
    """Return the area enclosed by the polygon of vertices ``(x[i], y[i])`` using the shoelace formula."""
    return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))


def _area(coordinates):
    """Return the area of a polygon given as an iterable of rings.

    The first ring is taken as the outer boundary; the area of every following ring is subtracted from it.

    Args:
        coordinates (Iterable): Rings of the polygon, each convertible to an array whose
            columns ``0`` and ``1`` are read as vertex coordinates.

    Returns:
        float: The area of the first ring minus the area of all other rings.

    """
    rings = [np.array(ring) for ring in coordinates]
    outer, inners = rings[0], rings[1:]

    area = _shoelace_formula(outer[:, 0], outer[:, 1])
    for inner in inners:
        area -= _shoelace_formula(inner[:, 0], inner[:, 1])
    return area


def _area_interface(descriptor):
    """Build and validate the :attr:`__descriptor__` dictionary of an area descriptor.

    It mirrors the generic :attr:`__descriptor__` construction except that the displayed name is forced to
    ``'Area'`` instead of being derived from the descriptor name (which is ``'coordinates'``).

    Args:
        descriptor (|Descriptor|): The descriptor to summarize.

    Returns:
        dict: The validated interface dictionary.

    Raises:
        ValueError: If the interface dictionary does not validate against the descriptor interface schema.

    """
    interface = descriptor._make_interface()
    interface.update({'name': 'Area', 'property': descriptor.property_name})

    try:
        descriptor.__interface_schema__.validate(interface)
    except schema.SchemaError as e:
        raise ValueError('Invalid __descriptor__ returned:\n'
                         '{}\n{}'.format(e.errors, e.autos)) from e

    return interface


class Area(ContinuousDescriptor):
    """Extract an area property from :attr:`~plums.commons.data.record.Record.coordinates`.

    Args:
        scale (tuple): Optional. Default to ``None``. If given, it must be an interval :math:`[a, b]`
            over which the extracted property will be scaled.

    """

    def __init__(self, scale=None):
        super(Area, self).__init__(name='coordinates', scale=scale, fetch_fn=_area)

    @property
    def __descriptor__(self):  # noqa: D401
        """dict: A dictionary which summarizes the internal state of the |Descriptor| instance.

        Keys:
            name (str): The descriptor name, always ``'Area'`` here.
            property (str): The name of the property written out by the |Descriptor| in each |Record|.
            type (str): Either ``categorical`` or ``continuous``.
            schema (dict, tuple, ColorMap): For ``categorical`` descriptors, it must be a dict mapping each
                ``category name`` to its relevant value (*e.g.* its index or |Color|).
                For ``continuous`` descriptors, it must be a range description (*i.e.* either a
                ``(start, end)`` tuple or an object with a ``range`` attribute like a |ColorMap|).

        See Also:
            For more information on how :attr:`__descriptor__` is constructed, please see :ref:`descriptor`.

        """
        return _area_interface(self)


class IntervalArea(IntervalDescriptor):
    """Extract an area property from :attr:`~plums.commons.data.record.Record.coordinates`.

    Args:
        n (int): Optional. Default to ``5``. The number of bins used to categorise the continuous property.

    """

    def __init__(self, n=5):
        super(IntervalArea, self).__init__(name='coordinates', n=n, fetch_fn=_area)

    @property
    def __descriptor__(self):  # noqa: D401
        """dict: A dictionary which summarizes the internal state of the |Descriptor| instance.

        Keys:
            name (str): The descriptor name, always ``'Area'`` here.
            property (str): The name of the property written out by the |Descriptor| in each |Record|.
            type (str): Either ``categorical`` or ``continuous``.
            schema (dict, tuple, ColorMap): For ``categorical`` descriptors, it must be a dict mapping each
                ``category name`` to its relevant value (*e.g.* its index or |Color|).
                For ``continuous`` descriptors, it must be a range description (*i.e.* either a
                ``(start, end)`` tuple or an object with a ``range`` attribute like a |ColorMap|).

        See Also:
            For more information on how :attr:`__descriptor__` is constructed, please see :ref:`descriptor`.

        """
        return _area_interface(self)