"""Data reading and processing pipeline for multiple file formats.
The DataReader is the main entry point for loading data files. It handles:
- Path resolution (absolute, relative, and glob patterns)
- File type detection based on extension
- Optional file handling (returns None instead of raising errors)
- Custom reader function delegation
- Automatic data processing and transformations (filtering, type casting, etc.)
- Placeholder substitution in filter specifications
File type support is determined by EXTENSION_MAPPING, with custom readers
available via the reader parameter in DataFile configurations.
See Also
--------
:class:`~r2x_core.datafile.DataFile` : File configuration with metadata and processing specs.
:class:`~r2x_core.file_readers.read_file_by_type` : Singledispatch file readers by type.
:func:`~r2x_core.processors.apply_processing` : Data transformation pipeline.
:func:`~r2x_core.datafile_utils.get_file_path` : Path resolution and validation.
"""
from collections.abc import Callable
from pathlib import Path
from typing import Any
from loguru import logger
from rust_ok import Ok
from .datafile import DataFile
from .exceptions import ReaderError
from .file_readers import read_file_by_type
from .file_types import EXTENSION_MAPPING
from .processors import apply_processing, register_transformation
from .utils.files import get_fpath
[docs]
class DataReader:
    """Reader class for loading data files with automatic processing.

    The DataReader handles the complete reading pipeline: file discovery,
    format detection, reading, and transformations. File-type-specific
    reading logic is delegated via singledispatch based on file extension.

    See Also
    --------
    :class:`~r2x_core.datafile.DataFile` : File configuration with specs.
    :meth:`read_data_file` : Main method for reading data with processing.
    """

    # Class-level cache of supported extensions, filled on first request by
    # ``get_supported_file_types``.
    _SUPPORTED_FILE_TYPES: list[str] | None = None

    def __init__(self) -> None:
        """Initialize the data reader.

        No configuration needed; the reader is stateless and uses configuration
        from DataFile objects and optional placeholders at read time.
        """

    def read_data_file(
        self,
        data_file: DataFile,
        *,
        folder_path: Path,
        placeholders: dict[str, Any] | None = None,
    ) -> Any:
        """Read a data file.

        Parameters
        ----------
        data_file : DataFile
            Data file configuration with metadata.
        folder_path : Path
            Base directory containing the data files.
        placeholders : dict[str, Any] | None, optional
            Dictionary mapping placeholder variable names to their values.
            Used to substitute placeholders like {solve_year} in filter_by.
            Default is None.

        Returns
        -------
        Any
            The loaded data, type depends on file type. ``None`` is returned
            when the file is marked optional and could not be found.

        Raises
        ------
        FileNotFoundError
            If a required file does not exist or if a glob pattern matches no files.
        ValueError
            If glob pattern is malformed (no wildcards) or if placeholders are found
            in filter_by but no placeholders dict provided.
        MultipleFileError
            If a glob pattern matches multiple files (subclass of ValueError).
        ReaderError
            If the processing step reports an error.

        See Also
        --------
        :func:`~r2x_core.file_readers.read_file_by_type` : File-type-specific reading.
        :func:`~r2x_core.processors.apply_processing` : Transformation pipeline.
        :func:`~r2x_core.utils.files.get_fpath` : Path resolution.
        """
        logger.debug("Starting reading for data_file={}", data_file.name)

        # Files are required unless their info block explicitly marks them optional.
        is_optional = data_file.info.is_optional if data_file.info else False

        fpath_result = get_fpath(data_file, folder_path=folder_path, info=data_file.info)
        if fpath_result.is_err():
            error = fpath_result.err()
            # Only a missing *optional* file is tolerated; every other
            # resolution error is re-raised unchanged.
            if isinstance(error, FileNotFoundError) and is_optional:
                logger.info("Skipping optional file: {}", data_file.name)
                return None
            raise error
        assert isinstance(fpath_result, Ok), "Result should be Ok after error check"
        fpath = fpath_result.value

        reader = data_file.reader
        reader_kwargs = reader.kwargs if reader else {}

        if reader and reader.function:
            # A user-supplied reader function takes precedence over
            # extension-based dispatch.
            logger.debug(
                "Attempting to read data_file={} with reader_function={}",
                data_file.name,
                data_file.reader,
            )
            raw_data = reader.function(fpath, **reader_kwargs)
        else:
            # ``file_type`` is only consulted when no custom reader is
            # configured, exactly as before.
            file_type_instance = data_file.file_type
            logger.trace(
                "Attempting to read data_file={} with {}",
                data_file.name,
                type(file_type_instance).__name__,
            )
            raw_data = read_file_by_type(file_type_instance, file_path=fpath, **reader_kwargs)

        return self._process_raw_data(raw_data, data_file, placeholders)

    def _process_raw_data(
        self,
        raw_data: Any,
        data_file: DataFile,
        placeholders: dict[str, Any] | None,
    ) -> Any:
        """Run the processing spec of ``data_file`` over ``raw_data``, if it has one.

        Returns ``raw_data`` untouched when ``data_file.proc_spec`` is None;
        otherwise returns the unwrapped processing result, raising
        ``ReaderError`` when processing reports an error.
        """
        if data_file.proc_spec is None:
            return raw_data

        result = apply_processing(
            raw_data,
            data_file=data_file,
            proc_spec=data_file.proc_spec,
            placeholders=placeholders,
        )
        if result.is_err():
            raise ReaderError(result.error)
        assert isinstance(result, Ok), "Result should be Ok after error check"
        return result.value

    def get_supported_file_types(self) -> list[str]:
        """Get list of supported file extensions.

        The keys of ``EXTENSION_MAPPING`` are collected once and cached on the
        class; later calls reuse the cached values.

        Returns
        -------
        list[str]
            List of supported file extensions. A new list is returned on each
            call, so callers may modify it without affecting the cache.
        """
        cls = type(self)
        cached = cls._SUPPORTED_FILE_TYPES
        if cached is None:
            cached = list(EXTENSION_MAPPING)
            cls._SUPPORTED_FILE_TYPES = cached
        # Hand out a copy so the shared class-level cache cannot be mutated
        # through the returned list.
        return list(cached)