Source code for compass.utilities.io

"""COMPASS I/O utilities

A lot of this is taken directly from NLR's GAPs repo:
https://github.com/NatLabRockies/gaps
"""

import logging
import contextlib
import collections
from pathlib import Path
from abc import ABC, abstractmethod

import json
import yaml
import toml
import pyjson5
import pprint
from elm.web.file_loader import AsyncLocalFileLoader

from compass.utilities.enums import CaseInsensitiveEnum
from compass.exceptions import COMPASSValueError, COMPASSFileNotFoundError


logger = logging.getLogger(__name__)
_CONFIG_HANDLER_REGISTRY = {}


class _JSON5Formatter:
    """Format input JSON5 data with indentation"""

    def __init__(self, data):
        self.data = data

    def _format_as_json(self):
        """Format the data input with as string with indentation"""
        return json.dumps(self.data, indent=4)


[docs] class Handler(ABC): """ABC for configuration file handler""" def __init_subclass__(cls): super().__init_subclass__() if isinstance(cls.FILE_EXTENSION, str): _CONFIG_HANDLER_REGISTRY[cls.FILE_EXTENSION] = cls else: for file_extension in cls.FILE_EXTENSION: _CONFIG_HANDLER_REGISTRY[file_extension] = cls
[docs] @classmethod def load(cls, file_name): """Load the file contents""" config_str = Path(file_name).read_text(encoding="utf-8") return cls.loads(config_str)
[docs] @classmethod def write(cls, file_name, data): """Write the data to a file""" with Path(file_name).open("w", encoding="utf-8") as config_file: cls.dump(data, config_file)
[docs] @classmethod @abstractmethod def dump(cls, config, stream): """Write the config to a stream (file)"""
[docs] @classmethod @abstractmethod def dumps(cls, config): """Convert the config to a string"""
[docs] @classmethod @abstractmethod def loads(cls, config_str): """Parse the string into a config dictionary"""
@property @abstractmethod def FILE_EXTENSION(self): # noqa: N802 """str: Enum name to use"""
[docs] class JSONHandler(Handler): """JSON config file handler""" FILE_EXTENSION = "json" """JSON file extension"""
[docs] @classmethod def dump(cls, config, stream): """Write the config to a stream (JSON file)""" return json.dump(config, stream, indent=4)
[docs] @classmethod def dumps(cls, config): """Convert the config to a JSON string""" return json.dumps(config, indent=4)
[docs] @classmethod def loads(cls, config_str): """Parse the JSON string into a config dictionary""" return json.loads(config_str)
[docs] class JSON5Handler(Handler): """JSON5 config file handler""" FILE_EXTENSION = "json5" """JSON5 file extension"""
[docs] @classmethod def dump(cls, config, stream): """Write the config to a stream (JSON5 file)""" return pyjson5.encode_io( _JSON5Formatter(config), stream, supply_bytes=False, tojson="_format_as_json", )
[docs] @classmethod def dumps(cls, config): """Convert the config to a JSON5 string""" return pyjson5.encode( _JSON5Formatter(config), tojson="_format_as_json", )
[docs] @classmethod def loads(cls, config_str): """Parse the JSON5 string into a config dictionary""" return pyjson5.decode(config_str, maxdepth=-1)
[docs] class YAMLHandler(Handler): """YAML config file handler""" FILE_EXTENSION = "yaml", "yml" """YAML file extensions"""
[docs] @classmethod def dump(cls, config, stream): """Write the config to a stream (YAML file)""" return yaml.safe_dump(config, stream, indent=2, sort_keys=False)
[docs] @classmethod def dumps(cls, config): """Convert the config to a YAML string""" return yaml.safe_dump(config, indent=2, sort_keys=False)
[docs] @classmethod def loads(cls, config_str): """Parse the YAML string into a config dictionary""" return yaml.safe_load(config_str)
[docs] class TOMLHandler(Handler): """TOML config file handler""" FILE_EXTENSION = "toml" """TOML file extension"""
[docs] @classmethod def dump(cls, config, stream): """Write the config to a stream (TOML file)""" return toml.dump(config, stream)
[docs] @classmethod def dumps(cls, config): """Convert the config to a TOML string""" return toml.dumps(config)
[docs] @classmethod def loads(cls, config_str): """Parse the TOML string into a config dictionary""" return toml.loads(config_str)
class _ConfigType(CaseInsensitiveEnum): """Base config type enum class only meant to be initialized once""" @classmethod def _new_post_hook(cls, obj, value): """Hook for post-processing after __new__; adds methods""" obj.dump = _CONFIG_HANDLER_REGISTRY[value].dump obj.dumps = _CONFIG_HANDLER_REGISTRY[value].dumps obj.load = _CONFIG_HANDLER_REGISTRY[value].load obj.loads = _CONFIG_HANDLER_REGISTRY[value].loads obj.write = _CONFIG_HANDLER_REGISTRY[value].write obj.__doc__ = f"{value.upper()} config file handler" return obj ConfigType = _ConfigType( "ConfigType", { config_type.upper(): config_type for config_type in _CONFIG_HANDLER_REGISTRY }, ) """An enumeration of the parseable config types"""
[docs] def load_config(config_filepath, resolve_paths=True): """Load a config file Parameters ---------- config_filepath : path-like Path to config file. resolve_paths : bool, optional Option to (recursively) resolve file-paths in the dictionary w.r.t the config file directory. By default, ``True``. Returns ------- dict Dictionary containing configuration parameters. Raises ------ COMPASSValueError If input `config_filepath` has no file ending. """ config_filepath = Path(config_filepath).expanduser().resolve() if "." not in config_filepath.name: msg = ( f"Configuration file must have a file-ending. Got: " f"{config_filepath.name}" ) raise COMPASSValueError(msg) if not config_filepath.exists(): msg = f"Config file does not exist: {config_filepath}" raise COMPASSFileNotFoundError(msg) try: config_type = ConfigType(config_filepath.suffix[1:]) except ValueError as err: msg = ( f"Got unknown config file extension: " f"{config_filepath.suffix!r}. Supported extensions are: " f"{', '.join({ct.value for ct in ConfigType})}" ) raise COMPASSValueError(msg) from err config = config_type.load(config_filepath) if resolve_paths: return resolve_all_paths(config, config_filepath.parent) return config
[docs] def resolve_all_paths(container, base_dir): """Perform a deep string replacement and path resolve in `container` Parameters ---------- container : dict or list Container like a dictionary or list that may (or may not) contain relative paths to resolve. base_dir : path-like Base path to directory from which to resolve path string (typically current directory) Returns ------- dict or list Input container with updated strings. """ if isinstance(container, str): # `resolve_path` is safe to call on any string, # even if it is not a path container = resolve_path(container, Path(base_dir)) elif isinstance(container, collections.abc.Mapping): container = { key: resolve_all_paths(val, Path(base_dir)) for key, val in container.items() } elif isinstance(container, collections.abc.Sequence): container = [ resolve_all_paths(item, Path(base_dir)) for item in container ] return container
[docs] def resolve_path(path, base_dir): """Resolve a file path represented by the input string. This function resolves the input string if it resembles a path. Specifically, the string will be resolved if it starts with "``./``" or "``..``", or it if it contains either "``./``" or "``..``" somewhere in the string body. Otherwise, the string is returned unchanged, so this function *is* safe to call on any string, even ones that do not resemble a path. This method delegates the "resolving" logic to :meth:`pathlib.Path.resolve`. This means the path is made absolute, symlinks are resolved, and "``..``" components are eliminated. If the ``path`` input starts with "``./``" or "``..``", it is assumed to be w.r.t the config directory, *not* the run directory. Parameters ---------- path : str Input file path. base_dir : path-like Base path to directory from which to resolve path string (typically current directory). Returns ------- str The resolved path. """ base_dir = Path(base_dir) if path.startswith("./"): path = base_dir / Path(path[2:]) elif path.startswith(".."): path = base_dir / Path(path) elif "./" in path: # this covers both './' and '../' path = Path(path) with contextlib.suppress(AttributeError): # `path` is still a `str` path = path.expanduser().resolve().as_posix() return path
[docs] async def load_local_docs(fps, **kwargs): """Load local documents into `elm` document instances Parameters ---------- fps : Iterable Iterable of paths referencing local files to load. **kwargs Additional keyword arguments forwarded to :class:`elm.web.file_loader.AsyncLocalFileLoader` for configuration such as ``loader``, caching, or parsing options. Returns ------- list of BaseDocument Non-empty loaded documents corresponding to the supplied filepaths. Empty results (e.g., unreadable files) are filtered out of the returned list. Raises ------ elm.exceptions.ELMError Propagated when the underlying loader fails to read one of the provided files and is configured to raise on errors. Notes ----- Detailed debug information about loaded page counts is emitted via the ``compass.utilities.io`` logger at ``TRACE`` level to assist with troubleshooting ingestion runs. """ logger.trace("Loading docs for the following paths:\n%r", fps) logger.trace( "kwargs for AsyncLocalFileLoader:\n%s", pprint.PrettyPrinter().pformat(kwargs), ) file_loader = AsyncLocalFileLoader(**kwargs) docs = await file_loader.fetch_all(*fps) page_lens = { doc.attrs.get("source_fp", "Unknown"): len(doc.pages) for doc in docs } logger.debug( "Loaded the following number of pages for docs:\n%s", pprint.PrettyPrinter().pformat(page_lens), ) return [doc for doc in docs if not doc.empty]