Working with DataFiles
DataFiles configure how to load and process external data sources.
Basic DataFile
>>> import tempfile
>>> from r2x_core import DataFile, FileInfo
>>> from pathlib import Path
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     csv_file = Path(tmpdir) / "data.csv"
...     _ = csv_file.write_text("a,b,c\n1,2,3\n")
...     file_spec = DataFile(
...         name="data",
...         fpath=csv_file,
...         info=FileInfo(description="My data", is_input=True)
...     )
...     print(file_spec.name)
...     print(file_spec.info.description)
data
My data
Tabular Processing
Define transformations for CSV and Excel files:
>>> import tempfile
>>> from r2x_core import DataFile, TabularProcessing
>>> from pathlib import Path
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     csv_file = Path(tmpdir) / "data.csv"
...     _ = csv_file.write_text("old_name,unused,other\n1,2,3\n")
...     data_file = DataFile(
...         name="processed",
...         fpath=csv_file,
...         proc_spec=TabularProcessing(
...             column_mapping={"old_name": "new_name"},
...             drop_columns=["unused"]
...         )
...     )
...     print(data_file.proc_spec.column_mapping)
{'old_name': 'new_name'}
JSON Processing
Define transformations for JSON files:
>>> import tempfile
>>> import json
>>> from r2x_core import DataFile, JSONProcessing
>>> from pathlib import Path
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     json_file = Path(tmpdir) / "data.json"
...     _ = json_file.write_text(json.dumps({"old_key": "value"}))
...     data_file = DataFile(
...         name="json_data",
...         fpath=json_file,
...         proc_spec=JSONProcessing(key_mapping={"old_key": "new_key"})
...     )
...     print(data_file.proc_spec.key_mapping)
{'old_key': 'new_key'}
HDF5 Files
Work with HDF5 format files:
>>> import tempfile
>>> import h5py
>>> import numpy as np
>>> from r2x_core import DataFile, H5Format, ReaderConfig
>>> from pathlib import Path
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     h5_path = Path(tmpdir) / "data.h5"
...     with h5py.File(h5_path, "w") as f:
...         _ = f.create_dataset("load_profiles", data=np.array([1, 2, 3]))
...     h5_file = DataFile(
...         name="timeseries",
...         fpath=h5_path,
...         reader=ReaderConfig(kwargs={"key": "load_profiles"})
...     )
...     print(type(h5_file.file_type).__name__)
H5Format
File Format Detection
Automatically detect and work with different file formats using FileFormat:
>>> import tempfile
>>> from pathlib import Path
>>> from r2x_core import DataFile, FileFormat
>>> with tempfile.TemporaryDirectory() as tmpdir:
...     csv_path = Path(tmpdir) / "data.csv"
...     _ = csv_path.write_text("a,b\n1,2\n")
...     csv_file = DataFile(name="csv_data", fpath=csv_path)
...     # FileFormat is the base class for all file format types
...     isinstance(csv_file.file_type, FileFormat)
True
See Also
Read Data Files - Reading files
Configuring Data Settings - Configuration management