# Released under the MIT License. See LICENSE for details.
#
"""Functionality for importing, exporting, and validating dataclasses.

This allows complex nested dataclasses to be flattened to json-compatible
data and restored from said data. It also gracefully handles and preserves
unrecognized attribute data, allowing older clients to interact with newer
data formats in a nondestructive manner.
"""

# pylint: disable=too-many-lines

# Note: We do lots of comparing of exact types here which is normally
# frowned upon (stuff like isinstance() is usually encouraged).
# pylint: disable=unidiomatic-typecheck

from __future__ import annotations

import logging
from enum import Enum
import dataclasses
import typing
import datetime
from typing import TYPE_CHECKING, TypeVar, Generic
# Note: can pull this from typing once we update to Python 3.9+
# noinspection PyProtectedMember
from typing_extensions import get_args, get_type_hints, _AnnotatedAlias

from efro.util import enum_by_value

_pytz_utc: Any

# We don't *require* pytz but we want to support it for tzinfos if available.
try:
    import pytz
    _pytz_utc = pytz.utc
except ModuleNotFoundError:
    _pytz_utc = None  # pylint: disable=invalid-name

if TYPE_CHECKING:
    from typing import Any, Dict, Type, Tuple, Optional, List, Set

T = TypeVar('T')

# Types which we can pass through as-is.
SIMPLE_TYPES = {int, bool, str, float, type(None)}

# How deep we go when prepping nested types
# (basically for detecting recursive types).
MAX_RECURSION = 10

# Attr name for data we store on dataclass types as part of prep.
PREP_ATTR = '_DCIOPREP'

# Attr name for dict of extra attributes included on dataclass instances.
# Note that this is only added if extra attributes are present.
EXTRA_ATTRS_ATTR = '_DCIOEXATTRS'


class Codec(Enum):
    """Specifies expected data format exported to or imported from."""

    # Use only types that will translate cleanly to/from json: lists,
    # dicts with str keys, bools, ints, floats, and None.
    JSON = 'json'

    # Mostly like JSON but passes bytes and datetime objects through
    # as-is instead of converting them to json-friendly types.
    FIRESTORE = 'firestore'


class IOAttrs:
    """For specifying io behavior in annotations.
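
    A small usage sketch (class and field names here are hypothetical):
    wrap a field's type in typing.Annotated and pass an IOAttrs instance
    to customize how that field is stored:

        @ioprepped
        @dataclass
        class Settings:
            # Stored under the short name 'v' and omitted from output
            # while still at its default value.
            volume: Annotated[float, IOAttrs('v', store_default=False)] = 1.0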
    """

    storagename: Optional[str] = None
    store_default: bool = True
    whole_days: bool = False
    whole_hours: bool = False

    def __init__(self,
                 storagename: Optional[str] = storagename,
                 store_default: bool = store_default,
                 whole_days: bool = whole_days,
                 whole_hours: bool = whole_hours):

        # Only store values that differ from class defaults to keep
        # our instances nice and lean.
        cls = type(self)
        if storagename != cls.storagename:
            self.storagename = storagename
        if store_default != cls.store_default:
            self.store_default = store_default
        if whole_days != cls.whole_days:
            self.whole_days = whole_days
        if whole_hours != cls.whole_hours:
            self.whole_hours = whole_hours

    def validate_for_field(self, cls: Type, field: dataclasses.Field) -> None:
        """Ensure the IOAttrs instance is ok to use with the provided field."""

        # Turning off store_default requires the field to have either
        # a default_factory or a default.
        if not self.store_default:
            default_factory: Any = field.default_factory  # type: ignore
            if (default_factory is dataclasses.MISSING
                    and field.default is dataclasses.MISSING):
                raise TypeError(f'Field {field.name} of {cls} has'
                                f' neither a default nor a default_factory;'
                                f' store_default=False cannot be set for it.')

    def validate_datetime(self, value: datetime.datetime,
                          fieldpath: str) -> None:
        """Ensure a datetime value meets our value requirements."""
        if self.whole_days:
            if any(x != 0 for x in (value.hour, value.minute, value.second,
                                    value.microsecond)):
                raise ValueError(
                    f'Value {value} at {fieldpath} is not a whole day.')
        if self.whole_hours:
            if any(x != 0
                   for x in (value.minute, value.second, value.microsecond)):
                raise ValueError(f'Value {value} at {fieldpath}'
                                 f' is not a whole hour.')


class FieldStoragePathCapture:
    """Utility for obtaining dataclass storage paths in a type-safe way.

    Given dataclass instance foo, FieldStoragePathCapture(foo).bar.eep
    will return 'bar.eep' (or something like 'b.e' if storagenames are
    overridden). This can be combined with type-checking tricks that
    return foo in the type-checker's eyes while returning
    FieldStoragePathCapture(foo) at runtime in order to grant a measure
    of type safety to specifying field paths for things such as db
    queries. Be aware, however, that the type-checker will incorrectly
    think these lookups are returning actual attr values when they are
    actually returning strings.
    """

    def __init__(self, obj: Any, path: Optional[List[str]] = None):
        if path is None:
            path = []
        if not dataclasses.is_dataclass(obj):
            raise TypeError(f'Expected a dataclass type/instance;'
                            f' got {type(obj)}.')
        self._cls = obj if isinstance(obj, type) else type(obj)
        self._path = path

    def __getattr__(self, name: str) -> Any:
        prep = PrepSession(explicit=False).prep_dataclass(self._cls,
                                                          recursion_level=0)
        try:
            anntype = prep.annotations[name]
        except KeyError as exc:
            raise AttributeError(f'{type(self)} has no {name} field.') from exc
        anntype, ioattrs = _parse_annotated(anntype)
        storagename = (name if
                       (ioattrs is None or ioattrs.storagename is None) else
                       ioattrs.storagename)
        origin = _get_origin(anntype)
        path = self._path + [storagename]

        if dataclasses.is_dataclass(origin):
            return FieldStoragePathCapture(origin, path=path)
        return '.'.join(path)


def dataclass_to_dict(obj: Any,
                      codec: Codec = Codec.JSON,
                      coerce_to_float: bool = True) -> dict:
    """Given a dataclass object, return a json-friendly dict.

    All values will be checked to ensure they match the types specified
    on fields. Note that a limited set of types and data configurations
    is supported.

    Values with type Any will be checked to ensure they match types
    supported directly by json. This does not include types such as
    tuples which are implicitly translated by Python's json module (as
    this would break the ability to do a lossless round-trip with data).

    If coerce_to_float is True, integer values present on float typed
    fields will be converted to floats in the dict output. If False, a
    TypeError will be triggered.
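
    A minimal round-trip sketch (the class here is hypothetical):

        @ioprepped
        @dataclass
        class Point:
            x: float = 0.0
            y: float = 0.0

        vals = dataclass_to_dict(Point(1.0, 2.0))  # {'x': 1.0, 'y': 2.0}
        pt = dataclass_from_dict(Point, vals)      # Point(x=1.0, y=2.0)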
""" out = _Outputter(obj, create=True, codec=codec, coerce_to_float=coerce_to_float).run() assert isinstance(out, dict) return out def dataclass_to_json(obj: Any, coerce_to_float: bool = True) -> str: """Utility function; return a json string from a dataclass instance. Basically json.dumps(dataclass_to_dict(...)). """ import json return json.dumps( dataclass_to_dict(obj=obj, coerce_to_float=coerce_to_float, codec=Codec.JSON), separators=(',', ':'), ) def dataclass_from_dict(cls: Type[T], values: dict, codec: Codec = Codec.JSON, coerce_to_float: bool = True, allow_unknown_attrs: bool = True, discard_unknown_attrs: bool = False) -> T: """Given a dict, return a dataclass of a given type. The dict must be formatted to match the specified codec (generally json-friendly object types). This means that sequence values such as tuples or sets should be passed as lists, enums should be passed as their associated values, nested dataclasses should be passed as dicts, etc. All values are checked to ensure their types/values are valid. Data for attributes of type Any will be checked to ensure they match types supported directly by json. This does not include types such as tuples which are implicitly translated by Python's json module (as this would break the ability to do a lossless round-trip with data). If coerce_to_float is True, int values passed for float typed fields will be converted to float values. Otherwise a TypeError is raised. If allow_unknown_attrs is False, AttributeErrors will be raised for attributes present in the dict but not on the data class. Otherwise they will be preserved as part of the instance and included if it is exported back to a dict, unless discard_unknown_attrs is True, in which case they will simply be discarded. """ return _Inputter(cls, codec=codec, coerce_to_float=coerce_to_float, allow_unknown_attrs=allow_unknown_attrs, discard_unknown_attrs=discard_unknown_attrs).run(values) def dataclass_from_json(cls: Type[T], json_str: str, coerce_to_float: bool = True, allow_unknown_attrs: bool = True, discard_unknown_attrs: bool = False) -> T: """Utility function; return a dataclass instance given a json string. Basically dataclass_from_dict(json.loads(...)) """ import json return dataclass_from_dict(cls=cls, values=json.loads(json_str), coerce_to_float=coerce_to_float, allow_unknown_attrs=allow_unknown_attrs, discard_unknown_attrs=discard_unknown_attrs) def dataclass_validate(obj: Any, coerce_to_float: bool = True, codec: Codec = Codec.JSON) -> None: """Ensure that values in a dataclass instance are the correct types.""" # Simply run an output pass but tell it not to generate data; # only run validation. _Outputter(obj, create=False, codec=codec, coerce_to_float=coerce_to_float).run() def ioprep(cls: Type) -> None: """Prep a dataclass type for use with this module's functionality. Prepping ensures that all types contained in a data class as well as the usage of said types are supported by this module and pre-builds necessary constructs needed for encoding/decoding/etc. Prepping will happen on-the-fly as needed, but a warning will be emitted in such cases, as it is better to explicitly prep all used types early in a process to ensure any invalid types or configuration are caught immediately. Prepping a dataclass involves evaluating its type annotations, which, as of PEP 563, are stored simply as strings. This evaluation is done in the module namespace containing the class, so all referenced types must be defined at that level. 
""" PrepSession(explicit=True).prep_dataclass(cls, recursion_level=0) def ioprepped(cls: Type[T]) -> Type[T]: """Class decorator for easily prepping a dataclass at definition time. Note that in some cases it may not be possible to prep a dataclass immediately (such as when its type annotations refer to forward-declared types). In these cases, dataclass_prep() should be explicitly called for the class as soon as possible; ideally at module import time to expose any errors as early as possible in execution. """ ioprep(cls) return cls @dataclasses.dataclass class PrepData: """Data we prepare and cache for a class during prep. This data is used as part of the encoding/decoding/validating process. """ # Resolved annotation data with 'live' classes. annotations: Dict[str, Any] # Map of storage names to attr names. storage_names_to_attr_names: Dict[str, str] class PrepSession: """Context for a prep.""" def __init__(self, explicit: bool): self.explicit = explicit def prep_dataclass(self, cls: Type, recursion_level: int) -> PrepData: """Run prep on a dataclass if necessary and return its prep data.""" # We should only need to do this once per dataclass. existing_data = getattr(cls, PREP_ATTR, None) if existing_data is not None: assert isinstance(existing_data, PrepData) return existing_data # If we run into classes containing themselves, we may have # to do something smarter to handle it. if recursion_level > MAX_RECURSION: raise RuntimeError('Max recursion exceeded.') # We should only be passed classes which are dataclasses. if not isinstance(cls, type) or not dataclasses.is_dataclass(cls): raise TypeError(f'Passed arg {cls} is not a dataclass type.') # Generate a warning on non-explicit preps; we prefer prep to # happen explicitly at runtime so errors can be detected early on. if not self.explicit: logging.warning( 'efro.dataclassio: implicitly prepping dataclass: %s.' ' It is highly recommended to explicitly prep dataclasses' ' as soon as possible after definition (via' ' efro.dataclassio.ioprep() or the' ' @efro.dataclassio.ioprepped decorator).', cls) try: # NOTE: perhaps we want to expose the globalns/localns args # to this? # pylint: disable=unexpected-keyword-arg resolved_annotations = get_type_hints(cls, include_extras=True) # pylint: enable=unexpected-keyword-arg except Exception as exc: raise RuntimeError( f'dataclassio prep for {cls} failed with error: {exc}.' f' Make sure all types used in annotations are defined' f' at the module level or add them as part of an explicit' f' prep call.') from exc # noinspection PyDataclass fields = dataclasses.fields(cls) fields_by_name = {f.name: f for f in fields} all_storage_names: Set[str] = set() storage_names_to_attr_names: Dict[str, str] = {} # Ok; we've resolved actual types for this dataclass. # now recurse through them, verifying that we support all contained # types and prepping any contained dataclass types. for attrname, anntype in resolved_annotations.items(): anntype, ioattrs = _parse_annotated(anntype) # If we found attached IOAttrs data, make sure it contains # valid values for the field it is attached to. if ioattrs is not None: ioattrs.validate_for_field(cls, fields_by_name[attrname]) if ioattrs.storagename is not None: storagename = ioattrs.storagename storage_names_to_attr_names[ioattrs.storagename] = attrname else: storagename = attrname else: storagename = attrname # Make sure we don't have any clashes in our storage names. 
            if storagename in all_storage_names:
                raise TypeError(f'Multiple attrs on {cls} are using'
                                f" storage-name '{storagename}'")
            all_storage_names.add(storagename)

            self.prep_type(cls,
                           attrname,
                           anntype,
                           recursion_level=recursion_level + 1)

        # Success! Store our resolved stuff with the class and we're done.
        prepdata = PrepData(
            annotations=resolved_annotations,
            storage_names_to_attr_names=storage_names_to_attr_names)
        setattr(cls, PREP_ATTR, prepdata)
        return prepdata

    def prep_type(self, cls: Type, attrname: str, anntype: Any,
                  recursion_level: int) -> None:
        """Run prep on a single type used within a dataclass."""
        # pylint: disable=too-many-return-statements
        # pylint: disable=too-many-branches

        # If we run into classes containing themselves, we may have
        # to do something smarter to handle it.
        if recursion_level > MAX_RECURSION:
            raise RuntimeError('Max recursion exceeded.')

        origin = _get_origin(anntype)

        if origin is typing.Union:
            self.prep_union(cls,
                            attrname,
                            anntype,
                            recursion_level=recursion_level + 1)
            return

        if anntype is typing.Any:
            return

        # Everything below this point assumes the annotation type
        # resolves to a concrete type.
        if not isinstance(origin, type):
            raise TypeError(
                f'Unsupported type found for \'{attrname}\' on {cls}:'
                f' {anntype}')

        if origin in SIMPLE_TYPES:
            return

        # For sets and lists, check out their single contained type
        # (if any).
        if origin in (list, set):
            childtypes = typing.get_args(anntype)
            if len(childtypes) == 0:
                # This is equivalent to Any; nothing else needs checking.
                return
            if len(childtypes) > 1:
                raise TypeError(
                    f'Unrecognized typing arg count {len(childtypes)}'
                    f" for {anntype} attr '{attrname}' on {cls}")
            self.prep_type(cls,
                           attrname,
                           childtypes[0],
                           recursion_level=recursion_level + 1)
            return

        if origin is dict:
            childtypes = typing.get_args(anntype)
            assert len(childtypes) in (0, 2)

            # For key types we support Any, str, int,
            # and Enums with uniform str/int values.
            if not childtypes or childtypes[0] is typing.Any:
                # 'Any' needs no further checks (just checked per-instance).
                pass
            elif childtypes[0] in (str, int):
                # str and int are all good as keys.
                pass
            elif issubclass(childtypes[0], Enum):
                # Allow our usual str or int enum types as keys.
                self.prep_enum(childtypes[0])
            else:
                raise TypeError(
                    f'Dict key type {childtypes[0]} for \'{attrname}\''
                    f' on {cls.__name__} is not supported by dataclassio.')

            # For value types we support any of our normal types.
            if not childtypes or _get_origin(childtypes[1]) is typing.Any:
                # 'Any' needs no further checks (just checked per-instance).
                pass
            else:
                self.prep_type(cls,
                               attrname,
                               childtypes[1],
                               recursion_level=recursion_level + 1)
            return

        # For Tuples, simply check individual member types.
        # (and, for now, explicitly disallow zero member types or usage
        # of ellipsis)
        if origin is tuple:
            childtypes = typing.get_args(anntype)
            if not childtypes:
                raise TypeError(
                    f'Tuple at \'{attrname}\''
                    f' has no type args; dataclassio requires type args.')
            if childtypes[-1] is ...:
                raise TypeError(f'Found ellipsis as part of type for'
                                f' \'{attrname}\' on {cls.__name__};'
                                f' these are not'
                                f' supported by dataclassio.')
            for childtype in childtypes:
                self.prep_type(cls,
                               attrname,
                               childtype,
                               recursion_level=recursion_level + 1)
            return

        if issubclass(origin, Enum):
            self.prep_enum(origin)
            return

        # We allow datetime objects (and google's extended subclass of
        # them used in firestore, which is why we don't look for exact
        # type here).
        if issubclass(origin, datetime.datetime):
            return

        if dataclasses.is_dataclass(origin):
            self.prep_dataclass(origin, recursion_level=recursion_level + 1)
            return

        if origin is bytes:
            return

        raise TypeError(f"Attr '{attrname}' on {cls.__name__} contains"
                        f" type '{anntype}'"
                        f' which is not supported by dataclassio.')

    def prep_union(self, cls: Type, attrname: str, anntype: Any,
                   recursion_level: int) -> None:
        """Run prep on a Union type."""
        typeargs = typing.get_args(anntype)
        if (len(typeargs) != 2
                or len([c for c in typeargs if c is type(None)]) != 1):
            raise TypeError(f'Union {anntype} for attr \'{attrname}\' on'
                            f' {cls.__name__} is not supported by'
                            f' dataclassio; only 2-member Unions with one'
                            f' type being None are supported.')
        for childtype in typeargs:
            self.prep_type(cls,
                           attrname,
                           childtype,
                           recursion_level=recursion_level + 1)

    def prep_enum(self, enumtype: Type[Enum]) -> None:
        """Run prep on an enum type."""
        valtype: Any = None

        # We currently support enums with str or int values; fail if we
        # find any others.
        for enumval in enumtype:
            if not isinstance(enumval.value, (str, int)):
                raise TypeError(f'Enum value {enumval} has value type'
                                f' {type(enumval.value)}; only str and int'
                                f' are supported by dataclassio.')
            if valtype is None:
                valtype = type(enumval.value)
            else:
                if type(enumval.value) is not valtype:
                    raise TypeError(f'Enum type {enumtype} has multiple'
                                    f' value types; dataclassio requires'
                                    f' them to be uniform.')


def _is_valid_for_codec(obj: Any, codec: Codec) -> bool:
    """Return whether a value consists solely of json-supported types.

    Note that this does not include things like tuples which are
    implicitly translated to lists by python's json module.
    """
    if obj is None:
        return True

    objtype = type(obj)
    if objtype in (int, float, str, bool):
        return True
    if objtype is dict:
        # JSON 'objects' support only string dict keys, but all value
        # types.
        return all(
            type(k) is str and _is_valid_for_codec(v, codec)
            for k, v in obj.items())
    if objtype is list:
        return all(_is_valid_for_codec(elem, codec) for elem in obj)

    # A few things are valid in firestore but not json.
    if issubclass(objtype, datetime.datetime) or objtype is bytes:
        return codec is Codec.FIRESTORE

    return False


def _raise_type_error(fieldpath: str, valuetype: Type,
                      expected: Tuple[Type, ...]) -> None:
    """Raise an error when a field value's type does not match expected."""
    assert isinstance(expected, tuple)
    assert all(isinstance(e, type) for e in expected)
    if len(expected) == 1:
        expected_str = expected[0].__name__
    else:
        names = ', '.join(t.__name__ for t in expected)
        expected_str = f'Union[{names}]'
    raise TypeError(f'Invalid value type for "{fieldpath}";'
                    f' expected "{expected_str}", got'
                    f' "{valuetype.__name__}".')


def _get_origin(anntype: Any) -> Any:
    """Given a type annotation, return its origin or itself if there is none.

    This differs from typing.get_origin in that it will never return None.
    This lets us use the same code path for handling typing.List that we
    do for handling list, which is good since they can be used
    interchangeably in annotations.
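
    For example:

        _get_origin(typing.List[int])  # -> list
        _get_origin(list)              # -> list (get_origin gives None)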
""" origin = typing.get_origin(anntype) return anntype if origin is None else origin class _Outputter: """Validates or exports data contained in a dataclass instance.""" def __init__(self, obj: Any, create: bool, codec: Codec, coerce_to_float: bool) -> None: self._obj = obj self._create = create self._codec = codec self._coerce_to_float = coerce_to_float def run(self) -> Any: """Do the thing.""" return self._process_dataclass(type(self._obj), self._obj, '') def _process_dataclass(self, cls: Type, obj: Any, fieldpath: str) -> Any: # pylint: disable=too-many-locals # pylint: disable=too-many-branches prep = PrepSession(explicit=False).prep_dataclass(type(obj), recursion_level=0) fields = dataclasses.fields(obj) out: Optional[Dict[str, Any]] = {} if self._create else None for field in fields: fieldname = field.name if fieldpath: subfieldpath = f'{fieldpath}.{fieldname}' else: subfieldpath = fieldname anntype = prep.annotations[fieldname] value = getattr(obj, fieldname) anntype, ioattrs = _parse_annotated(anntype) # If we're not storing default values for this fella, # we can skip all output processing if we've got a default value. if ioattrs is not None and not ioattrs.store_default: default_factory: Any = field.default_factory # type: ignore if default_factory is not dataclasses.MISSING: if default_factory() == value: continue elif field.default is not dataclasses.MISSING: if field.default == value: continue else: raise RuntimeError( f'Field {fieldname} of {cls.__name__} has' f' neither a default nor a default_factory;' f' store_default=False cannot be set for it.' f' (AND THIS SHOULD HAVE BEEN CAUGHT IN PREP!)') outvalue = self._process_value(cls, subfieldpath, anntype, value, ioattrs) if self._create: assert out is not None storagename = (fieldname if (ioattrs is None or ioattrs.storagename is None) else ioattrs.storagename) out[storagename] = outvalue # If there's extra-attrs stored on us, check/include them. extra_attrs = getattr(obj, EXTRA_ATTRS_ATTR, None) if isinstance(extra_attrs, dict): if not _is_valid_for_codec(extra_attrs, self._codec): raise TypeError( f'Extra attrs on {fieldpath} contains data type(s)' f' not supported by json.') if self._create: assert out is not None out.update(extra_attrs) return out def _process_value(self, cls: Type, fieldpath: str, anntype: Any, value: Any, ioattrs: Optional[IOAttrs]) -> Any: # pylint: disable=too-many-return-statements # pylint: disable=too-many-branches # pylint: disable=too-many-statements origin = _get_origin(anntype) if origin is typing.Any: if not _is_valid_for_codec(value, self._codec): raise TypeError( f'Invalid value type for \'{fieldpath}\';' f" 'Any' typed values must contain types directly" f' supported by the specified codec ({self._codec.name});' f' found \'{type(value).__name__}\' which is not.') return value if self._create else None if origin is typing.Union: # Currently the only unions we support are None/Value # (translated from Optional), which we verified on prep. # So let's treat this as a simple optional case. if value is None: return None childanntypes_l = [ c for c in typing.get_args(anntype) if c is not type(None) ] assert len(childanntypes_l) == 1 return self._process_value(cls, fieldpath, childanntypes_l[0], value, ioattrs) # Everything below this point assumes the annotation type resolves # to a concrete type. (This should have been verified at prep time). 
        assert isinstance(origin, type)

        # For simple flat types, look for exact matches.
        if origin in SIMPLE_TYPES:
            if type(value) is not origin:
                # Special case: if they want to coerce ints to floats,
                # do so.
                if (self._coerce_to_float and origin is float
                        and type(value) is int):
                    return float(value) if self._create else None
                _raise_type_error(fieldpath, type(value), (origin, ))
            return value if self._create else None

        if origin is tuple:
            if not isinstance(value, tuple):
                raise TypeError(f'Expected a tuple for {fieldpath};'
                                f' found a {type(value)}')
            childanntypes = typing.get_args(anntype)

            # We should have verified this was non-zero at prep-time.
            assert childanntypes
            if len(value) != len(childanntypes):
                raise TypeError(f'Tuple at {fieldpath} contains'
                                f' {len(value)} values; type specifies'
                                f' {len(childanntypes)}.')
            if self._create:
                return [
                    self._process_value(cls, fieldpath, childanntypes[i], x,
                                        ioattrs)
                    for i, x in enumerate(value)
                ]
            for i, x in enumerate(value):
                self._process_value(cls, fieldpath, childanntypes[i], x,
                                    ioattrs)
            return None

        if origin is list:
            if not isinstance(value, list):
                raise TypeError(f'Expected a list for {fieldpath};'
                                f' found a {type(value)}')
            childanntypes = typing.get_args(anntype)

            # 'Any' type children; make sure they are valid values for
            # the specified codec.
            if len(childanntypes) == 0 or childanntypes[0] is typing.Any:
                for i, child in enumerate(value):
                    if not _is_valid_for_codec(child, self._codec):
                        raise TypeError(
                            f'Item {i} of {fieldpath} contains'
                            f' data type(s) not supported by the specified'
                            f' codec ({self._codec.name}).')
                # Hmm; should we do a copy here?
                return value if self._create else None

            # We contain elements of some specified type.
            assert len(childanntypes) == 1
            if self._create:
                return [
                    self._process_value(cls, fieldpath, childanntypes[0], x,
                                        ioattrs) for x in value
                ]
            for x in value:
                self._process_value(cls, fieldpath, childanntypes[0], x,
                                    ioattrs)
            return None

        if origin is set:
            if not isinstance(value, set):
                raise TypeError(f'Expected a set for {fieldpath};'
                                f' found a {type(value)}')
            childanntypes = typing.get_args(anntype)

            # 'Any' type children; make sure they are valid Any values.
            if len(childanntypes) == 0 or childanntypes[0] is typing.Any:
                for child in value:
                    if not _is_valid_for_codec(child, self._codec):
                        raise TypeError(
                            f'Set at {fieldpath} contains'
                            f' data type(s) not supported by the'
                            f' specified codec ({self._codec.name}).')
                return list(value) if self._create else None

            # We contain elements of some specified type.
            assert len(childanntypes) == 1
            if self._create:
                # Note: we output json-friendly values so this becomes
                # a list.
                return [
                    self._process_value(cls, fieldpath, childanntypes[0], x,
                                        ioattrs) for x in value
                ]
            for x in value:
                self._process_value(cls, fieldpath, childanntypes[0], x,
                                    ioattrs)
            return None

        if origin is dict:
            return self._process_dict(cls, fieldpath, anntype, value, ioattrs)

        if dataclasses.is_dataclass(origin):
            if not isinstance(value, origin):
                raise TypeError(f'Expected a {origin} for {fieldpath};'
                                f' found a {type(value)}.')
            return self._process_dataclass(cls, value, fieldpath)

        if issubclass(origin, Enum):
            if not isinstance(value, origin):
                raise TypeError(f'Expected a {origin} for {fieldpath};'
                                f' found a {type(value)}.')
            # At prep-time we verified that these enums had valid value
            # types, so we can blindly return it here.
            return value.value if self._create else None

        if issubclass(origin, datetime.datetime):
            if not isinstance(value, origin):
                raise TypeError(f'Expected a {origin} for {fieldpath};'
                                f' found a {type(value)}.')
            _ensure_datetime_is_timezone_aware(value)
            if ioattrs is not None:
                ioattrs.validate_datetime(value, fieldpath)
            if self._codec is Codec.FIRESTORE:
                return value
            assert self._codec is Codec.JSON
            return [
                value.year, value.month, value.day, value.hour, value.minute,
                value.second, value.microsecond
            ] if self._create else None

        if origin is bytes:
            return self._process_bytes(cls, fieldpath, value)

        raise TypeError(
            f"Field '{fieldpath}' of type '{anntype}' is unsupported here.")

    def _process_bytes(self, cls: Type, fieldpath: str, value: bytes) -> Any:
        import base64
        if not isinstance(value, bytes):
            raise TypeError(
                f'Expected bytes for {fieldpath} on {cls.__name__};'
                f' found a {type(value)}.')

        if not self._create:
            return None

        # In JSON we convert to base64, but firestore directly
        # supports bytes.
        if self._codec is Codec.JSON:
            return base64.b64encode(value).decode()

        assert self._codec is Codec.FIRESTORE
        return value

    def _process_dict(self, cls: Type, fieldpath: str, anntype: Any,
                      value: dict, ioattrs: Optional[IOAttrs]) -> Any:
        # pylint: disable=too-many-branches
        if not isinstance(value, dict):
            raise TypeError(f'Expected a dict for {fieldpath};'
                            f' found a {type(value)}.')
        childtypes = typing.get_args(anntype)
        assert len(childtypes) in (0, 2)

        # We treat 'Any' dicts simply as json; we don't do any translating.
        if not childtypes or childtypes[0] is typing.Any:
            if not isinstance(value, dict) or not _is_valid_for_codec(
                    value, self._codec):
                raise TypeError(
                    f'Invalid value for Dict[Any, Any]'
                    f' at \'{fieldpath}\' on {cls.__name__};'
                    f' all keys and values must be directly compatible'
                    f' with the specified codec ({self._codec.name})'
                    f' when dict type is Any.')
            return value if self._create else None

        # Ok; we've got a definite key type (which we verified as valid
        # during prep). Make sure all keys match it.
        out: Optional[Dict] = {} if self._create else None
        keyanntype, valanntype = childtypes

        # str keys we just export directly since that's supported by json.
        if keyanntype is str:
            for key, val in value.items():
                if not isinstance(key, str):
                    raise TypeError(
                        f'Got invalid key type {type(key)} for'
                        f' dict key at \'{fieldpath}\' on {cls.__name__};'
                        f' expected {keyanntype}.')
                outval = self._process_value(cls, fieldpath, valanntype, val,
                                             ioattrs)
                if self._create:
                    assert out is not None
                    out[key] = outval

        # int keys are stored as str versions of themselves.
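        # (e.g. a Dict[int, str] value of {1: 'a'} is output as {'1': 'a'}).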
        elif keyanntype is int:
            for key, val in value.items():
                if not isinstance(key, int):
                    raise TypeError(
                        f'Got invalid key type {type(key)} for'
                        f' dict key at \'{fieldpath}\' on {cls.__name__};'
                        f' expected an int.')
                outval = self._process_value(cls, fieldpath, valanntype, val,
                                             ioattrs)
                if self._create:
                    assert out is not None
                    out[str(key)] = outval
        elif issubclass(keyanntype, Enum):
            for key, val in value.items():
                if not isinstance(key, keyanntype):
                    raise TypeError(
                        f'Got invalid key type {type(key)} for'
                        f' dict key at \'{fieldpath}\' on {cls.__name__};'
                        f' expected a {keyanntype}.')
                outval = self._process_value(cls, fieldpath, valanntype, val,
                                             ioattrs)
                if self._create:
                    assert out is not None
                    out[str(key.value)] = outval
        else:
            raise RuntimeError(f'Unhandled dict out-key-type {keyanntype}')

        return out


class _Inputter(Generic[T]):

    def __init__(self,
                 cls: Type[T],
                 codec: Codec,
                 coerce_to_float: bool,
                 allow_unknown_attrs: bool = True,
                 discard_unknown_attrs: bool = False):
        self._cls = cls
        self._codec = codec
        self._coerce_to_float = coerce_to_float
        self._allow_unknown_attrs = allow_unknown_attrs
        self._discard_unknown_attrs = discard_unknown_attrs

        if not allow_unknown_attrs and discard_unknown_attrs:
            raise ValueError('discard_unknown_attrs cannot be True'
                             ' when allow_unknown_attrs is False.')

    def run(self, values: dict) -> T:
        """Do the thing."""
        out = self._dataclass_from_input(self._cls, '', values)
        assert isinstance(out, self._cls)
        return out

    def _value_from_input(self, cls: Type, fieldpath: str, anntype: Any,
                          value: Any, ioattrs: Optional[IOAttrs]) -> Any:
        """Convert an assigned value to what a dataclass field expects."""
        # pylint: disable=too-many-return-statements
        # pylint: disable=too-many-branches

        origin = _get_origin(anntype)

        if origin is typing.Any:
            if not _is_valid_for_codec(value, self._codec):
                raise TypeError(f'Invalid value type for \'{fieldpath}\';'
                                f' \'Any\' typed values must contain only'
                                f' types directly supported by the specified'
                                f' codec ({self._codec.name}); found'
                                f' \'{type(value).__name__}\' which is not.')
            return value

        if origin is typing.Union:
            # Currently the only unions we support are None/Value
            # (translated from Optional), which we verified on prep.
            # So let's treat this as a simple optional case.
            if value is None:
                return None
            childanntypes_l = [
                c for c in typing.get_args(anntype) if c is not type(None)
            ]
            assert len(childanntypes_l) == 1
            return self._value_from_input(cls, fieldpath, childanntypes_l[0],
                                          value, ioattrs)

        # Everything below this point assumes the annotation type resolves
        # to a concrete type. (This should have been verified at prep time.)
        assert isinstance(origin, type)

        if origin in SIMPLE_TYPES:
            if type(value) is not origin:
                # Special case: if they want to coerce ints to floats,
                # do so.
                if (self._coerce_to_float and origin is float
                        and type(value) is int):
                    return float(value)
                _raise_type_error(fieldpath, type(value), (origin, ))
            return value

        if origin in {list, set}:
            return self._sequence_from_input(cls, fieldpath, anntype, value,
                                             origin, ioattrs)

        if origin is tuple:
            return self._tuple_from_input(cls, fieldpath, anntype, value,
                                          ioattrs)

        if origin is dict:
            return self._dict_from_input(cls, fieldpath, anntype, value,
                                         ioattrs)

        if dataclasses.is_dataclass(origin):
            return self._dataclass_from_input(origin, fieldpath, value)

        if issubclass(origin, Enum):
            return enum_by_value(origin, value)

        if issubclass(origin, datetime.datetime):
            return self._datetime_from_input(cls, fieldpath, value, ioattrs)

        if origin is bytes:
            return self._bytes_from_input(origin, fieldpath, value)

        raise TypeError(
            f"Field '{fieldpath}' of type '{anntype}' is unsupported here.")

    def _bytes_from_input(self, cls: Type, fieldpath: str,
                          value: Any) -> bytes:
        """Given input data, returns bytes."""
        import base64

        # For firestore, bytes are passed as-is. Otherwise they're
        # encoded as base64.
        if self._codec is Codec.FIRESTORE:
            if not isinstance(value, bytes):
                raise TypeError(f'Expected a bytes object for {fieldpath}'
                                f' on {cls.__name__}; got a {type(value)}.')
            return value

        assert self._codec is Codec.JSON
        if not isinstance(value, str):
            raise TypeError(f'Expected a string object for {fieldpath}'
                            f' on {cls.__name__}; got a {type(value)}.')
        return base64.b64decode(value)

    def _dataclass_from_input(self, cls: Type, fieldpath: str,
                              values: dict) -> Any:
        """Given a dict, instantiates a dataclass of the given type.

        The dict must be in the json-friendly format as emitted from
        dataclass_to_dict. This means that sequence values such as tuples
        or sets should be passed as lists, enums should be passed as their
        associated values, and nested dataclasses should be passed as
        dicts.
        """
        # pylint: disable=too-many-locals
        if not isinstance(values, dict):
            raise TypeError(
                f'Expected a dict for {fieldpath} on {cls.__name__};'
                f' got a {type(values)}.')

        prep = PrepSession(explicit=False).prep_dataclass(cls,
                                                          recursion_level=0)

        extra_attrs = {}

        # noinspection PyDataclass
        fields = dataclasses.fields(cls)
        fields_by_name = {f.name: f for f in fields}
        args: Dict[str, Any] = {}
        for rawkey, value in values.items():
            key = prep.storage_names_to_attr_names.get(rawkey, rawkey)
            field = fields_by_name.get(key)

            # Store unknown attrs off to the side (or error if desired).
            if field is None:
                if self._allow_unknown_attrs:
                    if self._discard_unknown_attrs:
                        continue

                    # Treat this like 'Any' data; ensure that it is valid
                    # raw json.
                    if not _is_valid_for_codec(value, self._codec):
                        raise TypeError(
                            f'Unknown attr \'{key}\''
                            f' on {fieldpath} contains data type(s)'
                            f' not supported by the specified codec'
                            f' ({self._codec.name}).')
                    extra_attrs[key] = value
                else:
                    raise AttributeError(
                        f"'{cls.__name__}' has no '{key}' field.")
            else:
                fieldname = field.name
                anntype = prep.annotations[fieldname]
                anntype, ioattrs = _parse_annotated(anntype)
                subfieldpath = (f'{fieldpath}.{fieldname}'
                                if fieldpath else fieldname)
                args[key] = self._value_from_input(cls, subfieldpath, anntype,
                                                   value, ioattrs)

        try:
            out = cls(**args)
        except Exception as exc:
            raise RuntimeError(f'Error instantiating class {cls.__name__}'
                               f' at {fieldpath}: {exc}') from exc
        if extra_attrs:
            setattr(out, EXTRA_ATTRS_ATTR, extra_attrs)
        return out

    def _dict_from_input(self, cls: Type, fieldpath: str, anntype: Any,
                         value: Any, ioattrs: Optional[IOAttrs]) -> Any:
        # pylint: disable=too-many-branches
        # pylint: disable=too-many-locals
        if not isinstance(value, dict):
            raise TypeError(
                f'Expected a dict for \'{fieldpath}\' on {cls.__name__};'
                f' got a {type(value)}.')

        childtypes = typing.get_args(anntype)
        assert len(childtypes) in (0, 2)

        out: Dict

        # We treat 'Any' dicts simply as json; we don't do any translating.
        if not childtypes or childtypes[0] is typing.Any:
            if not isinstance(value, dict) or not _is_valid_for_codec(
                    value, self._codec):
                raise TypeError(f'Got invalid value for Dict[Any, Any]'
                                f' at \'{fieldpath}\' on {cls.__name__};'
                                f' all keys and values must be'
                                f' compatible with the specified codec'
                                f' ({self._codec.name}).')
            out = value
        else:
            out = {}
            keyanntype, valanntype = childtypes

            # Ok; we've got definite key/value types (which we verified
            # as valid during prep). Run all keys/values through them.

            # str keys we just take directly since that's supported by
            # json.
            if keyanntype is str:
                for key, val in value.items():
                    if not isinstance(key, str):
                        raise TypeError(
                            f'Got invalid key type {type(key)} for'
                            f' dict key at \'{fieldpath}\''
                            f' on {cls.__name__};'
                            f' expected a str.')
                    out[key] = self._value_from_input(cls, fieldpath,
                                                      valanntype, val,
                                                      ioattrs)

            # int keys are stored in json as str versions of themselves.
            elif keyanntype is int:
                for key, val in value.items():
                    if not isinstance(key, str):
                        raise TypeError(
                            f'Got invalid key type {type(key)} for'
                            f' dict key at \'{fieldpath}\''
                            f' on {cls.__name__};'
                            f' expected a str.')
                    try:
                        keyint = int(key)
                    except ValueError as exc:
                        raise TypeError(
                            f'Got invalid key value {key} for'
                            f' dict key at \'{fieldpath}\''
                            f' on {cls.__name__};'
                            f' expected an int in string form.') from exc
                    out[keyint] = self._value_from_input(
                        cls, fieldpath, valanntype, val, ioattrs)

            elif issubclass(keyanntype, Enum):
                # In prep we verified that all these enums' values have
                # the same type, so we can just look at the first to see
                # if this is a string enum or an int enum.
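                # (e.g. keys for an int-valued enum arrive as '1'/'2';
                # keys for a str-valued enum are the values themselves).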
                enumvaltype = type(next(iter(keyanntype)).value)
                assert enumvaltype in (int, str)
                if enumvaltype is str:
                    for key, val in value.items():
                        try:
                            enumval = enum_by_value(keyanntype, key)
                        except ValueError as exc:
                            raise ValueError(
                                f'Got invalid key value {key} for'
                                f' dict key at \'{fieldpath}\''
                                f' on {cls.__name__};'
                                f' expected a value corresponding to'
                                f' a {keyanntype}.') from exc
                        out[enumval] = self._value_from_input(
                            cls, fieldpath, valanntype, val, ioattrs)
                else:
                    for key, val in value.items():
                        try:
                            enumval = enum_by_value(keyanntype, int(key))
                        except (ValueError, TypeError) as exc:
                            raise ValueError(
                                f'Got invalid key value {key} for'
                                f' dict key at \'{fieldpath}\''
                                f' on {cls.__name__};'
                                f' expected a {keyanntype} value (though'
                                f' in string form).') from exc
                        out[enumval] = self._value_from_input(
                            cls, fieldpath, valanntype, val, ioattrs)
            else:
                raise RuntimeError(f'Unhandled dict in-key-type {keyanntype}')

        return out

    def _sequence_from_input(self, cls: Type, fieldpath: str, anntype: Any,
                             value: Any, seqtype: Type,
                             ioattrs: Optional[IOAttrs]) -> Any:

        # Because we are json-centric, we expect a list for all sequences.
        if type(value) is not list:
            raise TypeError(f'Invalid input value for "{fieldpath}";'
                            f' expected a list, got a {type(value).__name__}')

        childanntypes = typing.get_args(anntype)

        # 'Any' type children; make sure they are valid json values
        # and then just grab them.
        if len(childanntypes) == 0 or childanntypes[0] is typing.Any:
            for i, child in enumerate(value):
                if not _is_valid_for_codec(child, self._codec):
                    raise TypeError(f'Item {i} of {fieldpath} contains'
                                    f' data type(s) not supported by json.')
            return value if type(value) is seqtype else seqtype(value)

        # We contain elements of some specified type.
        assert len(childanntypes) == 1
        childanntype = childanntypes[0]
        return seqtype(
            self._value_from_input(cls, fieldpath, childanntype, i, ioattrs)
            for i in value)

    def _datetime_from_input(self, cls: Type, fieldpath: str, value: Any,
                             ioattrs: Optional[IOAttrs]) -> Any:

        # For firestore we expect a datetime object.
        if self._codec is Codec.FIRESTORE:
            # Don't compare exact type here, as firestore can give us
            # a subclass with extended precision.
            if not isinstance(value, datetime.datetime):
                raise TypeError(
                    f'Invalid input value for "{fieldpath}" on'
                    f' "{cls.__name__}";'
                    f' expected a datetime, got a {type(value).__name__}')
            _ensure_datetime_is_timezone_aware(value)
            return value

        assert self._codec is Codec.JSON

        # We expect a list of 7 ints.
        if type(value) is not list:
            raise TypeError(
                f'Invalid input value for "{fieldpath}" on "{cls.__name__}";'
                f' expected a list, got a {type(value).__name__}')
        if len(value) != 7 or not all(isinstance(x, int) for x in value):
            raise TypeError(
                f'Invalid input value for "{fieldpath}" on "{cls.__name__}";'
                f' expected a list of 7 ints.')
        out = datetime.datetime(  # type: ignore
            *value, tzinfo=datetime.timezone.utc)
        if ioattrs is not None:
            ioattrs.validate_datetime(out, fieldpath)
        return out

    def _tuple_from_input(self, cls: Type, fieldpath: str, anntype: Any,
                          value: Any, ioattrs: Optional[IOAttrs]) -> Any:

        out: List = []

        # Because we are json-centric, we expect a list for all sequences.
        if type(value) is not list:
            raise TypeError(f'Invalid input value for "{fieldpath}";'
                            f' expected a list, got a {type(value).__name__}')

        childanntypes = typing.get_args(anntype)

        # We should have verified this to be non-zero at prep-time.
        assert childanntypes
        if len(value) != len(childanntypes):
            raise TypeError(f'Invalid tuple input for "{fieldpath}";'
                            f' expected {len(childanntypes)} values,'
                            f' found {len(value)}.')

        for i, childanntype in enumerate(childanntypes):
            childval = value[i]

            # 'Any' type children; make sure they are valid json values
            # and then just grab them.
            if childanntype is typing.Any:
                if not _is_valid_for_codec(childval, self._codec):
                    raise TypeError(f'Item {i} of {fieldpath} contains'
                                    f' data type(s) not supported by json.')
                out.append(childval)
            else:
                out.append(
                    self._value_from_input(cls, fieldpath, childanntype,
                                           childval, ioattrs))

        assert len(out) == len(childanntypes)
        return tuple(out)


def _ensure_datetime_is_timezone_aware(value: datetime.datetime) -> None:
    # We only support timezone-aware utc times.
    if (value.tzinfo is not datetime.timezone.utc
            and (_pytz_utc is None or value.tzinfo is not _pytz_utc)):
        raise ValueError(
            'datetime values must have timezone set as timezone.utc')


def _parse_annotated(anntype: Any) -> Tuple[Any, Optional[IOAttrs]]:
    """Parse Annotated() constructs, returning annotated type & IOAttrs."""

    # If we get an Annotated[foo, bar, eep] we take
    # foo as the actual type and we look for IOAttrs instances in
    # bar/eep to affect our behavior.
    ioattrs: Optional[IOAttrs] = None
    if isinstance(anntype, _AnnotatedAlias):
        annargs = get_args(anntype)
        for annarg in annargs[1:]:
            if isinstance(annarg, IOAttrs):
                if ioattrs is not None:
                    raise RuntimeError(
                        'Multiple IOAttrs instances found for a'
                        ' single annotation; this is not supported.')
                ioattrs = annarg

            # I occasionally just throw a 'x' down when I mean IOAttrs('x');
            # catch these mistakes.
            elif isinstance(annarg, (str, int, float, bool)):
                raise RuntimeError(
                    f'Raw {type(annarg)} found in Annotated[] entry:'
                    f' {anntype}; this is probably not what you intended.')

        anntype = annargs[0]
    return anntype, ioattrs