Source code for compass.extraction.context
"""Extraction context for multi-document ordinance extraction"""
from textwrap import shorten
from collections.abc import Iterable
import pandas as pd
from compass.services.threaded import FileMover
from compass.exceptions import COMPASSTypeError
[docs]
class ExtractionContext:
    """Document-like wrapper around the sources of an extraction run

    An instance holds a list of candidate source documents, a separate
    list of the documents that were registered as actual data sources,
    and a dictionary of context-level attributes. It exposes ``text``,
    ``pages`` and ``attrs`` so it can be handed to code written
    against a single document, and it supports ``len``, indexing,
    iteration and truth-testing over the candidate documents
    """

    def __init__(self, documents=None, attrs=None):
        """

        Parameters
        ----------
        documents : sequence of BaseDocument, optional
            Source document(s) that make up this context. ``None``
            yields an empty context, a single non-iterable object is
            wrapped in a one-element list, and any other iterable is
            copied into a new list. By default, ``None``.
        attrs : dict, optional
            Context-level metadata. A falsy value (``None`` or an
            empty mapping) is replaced by a fresh empty dictionary.
            By default, ``None``.
        """
        self._data_docs = []
        self._documents = _as_list(documents)
        self.attrs = attrs or {}

    @property
    def text(self):
        """str: Combined text of all documents

        Equivalent to calling :meth:`multi_doc_context` with no
        arguments
        """
        return self.multi_doc_context()

    @property
    def pages(self):
        """list: Pages of every document, flattened in document order"""
        combined = []
        for doc in self.documents:
            combined.extend(doc.pages)
        return combined

    @property
    def num_documents(self):
        """int: How many source documents this context holds"""
        return len(self.documents)

    @property
    def documents(self):
        """list: Candidate documents that may hold relevant info"""
        return self._documents

    @documents.setter
    def documents(self, other):
        # Normalized the same way as the constructor argument
        self._documents = _as_list(other)

    @property
    def data_docs(self):
        """list: Documents registered as sources of extracted data"""
        return self._data_docs

    @data_docs.setter
    def data_docs(self, other):
        # Only a real ``list`` is accepted; it is stored as-is (no copy)
        if isinstance(other, list):
            self._data_docs = other
            return

        msg = "data_docs must be set to a *list* of documents"
        raise COMPASSTypeError(msg)

    def __str__(self):
        count = self.num_documents
        plural = "" if count == 1 else "s"
        summary = f"{self.__class__.__name__} with {count:,} document{plural}"

        if count > 0:
            # One bullet per document, labelled by its "source" attr
            bullet = "\n\t- "
            sources = bullet.join(
                doc.attrs.get("source", "Unknown source")
                for doc in self.documents
            )
            summary = f"{summary}:{bullet}{sources}"

        data_docs = _data_docs_repr(self.data_docs)
        attrs = _attrs_repr(self.attrs)
        return f"{summary}\n{data_docs}\n{attrs}"

    def __len__(self):
        return self.num_documents

    def __getitem__(self, index):
        return self.documents[index]

    def __iter__(self):
        return iter(self.documents)

    def __bool__(self):
        # Truthy only when at least one candidate document is present
        return bool(self.documents)

    async def mark_doc_as_data_source(self, doc, out_fn_stem=None):
        """Register a document as a source of extracted data

        The document is appended to the data-source list. When an
        output filename stem is given, the document is additionally
        passed to the module's file-moving helper, which records the
        resulting path under ``doc.attrs["out_fp"]``.

        Parameters
        ----------
        doc : BaseDocument
            Document to register as a data source.
        out_fn_stem : str, optional
            Output filename stem forwarded to the file-moving helper.
            If ``None``, no file is moved. By default, ``None``.
        """
        self._data_docs.append(doc)
        if out_fn_stem is None:
            return

        await _move_file_to_out_dir(doc, out_fn_stem)

    def multi_doc_context(self, attr_text_key=None):
        """Serialize all documents into one labelled text block

        Each document contributes a section made of its positional
        index and its content; sections are separated by a blank line
        and preceded by a single multi-document header.

        Parameters
        ----------
        attr_text_key : str, optional
            Key into each document's ``.attrs`` dictionary naming the
            text to use. If ``None``, each document's full ``.text``
            is used instead. By default, ``None``.

        Returns
        -------
        str
            The serialized documents, or an empty string when the
            context holds no documents.
        """
        if not self.documents:
            return ""

        sections = []
        for ind, doc in enumerate(self.documents):
            content = _text_from_doc(doc, attr_text_key)
            sections.append(
                f"# SOURCE INDEX #: {ind}\n"
                f"# CONTENT #:\n{content}"
            )

        body = "\n\n".join(sections)
        return f"## MULTI-DOCUMENT CONTEXT ##\n\n{body}"
async def _move_file_to_out_dir(doc, out_fn):
    """Move a document's file and remember where it ended up

    Awaits ``FileMover.call(doc, out_fn)`` and stores whatever it
    returns under ``doc.attrs["out_fp"]``. The same document object is
    returned for convenience.
    """
    # NOTE(review): presumably FileMover returns the output file path
    # for PDF / HTML text files - confirm against the service
    doc.attrs["out_fp"] = await FileMover.call(doc, out_fn)
    return doc
def _as_list(documents):
"""Convert input to list"""
if documents is None:
return []
if not isinstance(documents, Iterable):
return [documents]
return list(documents)
def _data_docs_repr(data_docs):
"""String representation of data source documents"""
if not data_docs:
return "Registered Data Source Documents: None"
data_docs = "\n\t- ".join(
[d.attrs.get("source", "Unknown source") for d in data_docs]
)
return f"Registered Data Source Documents:\n\t- {data_docs}"
def _attrs_repr(attrs):
"""String representation of context attributes"""
if not attrs:
return "Attrs: None"
attrs = {
k: (
f"DataFrame with {len(v):,} rows"
if isinstance(v, pd.DataFrame)
else v
)
for k, v in attrs.items()
}
indent = max(len(k) for k in attrs) + 2
width = max(10, 80 - (indent + 4))
to_join = []
for k, v in attrs.items():
v_str = str(v)
if "\n" in v_str:
v_str = shorten(v_str, width=width)
to_join.append(f"{k:>{indent}}:\t{v_str}")
attrs = "\n".join(to_join)
return f"Attrs:\n{attrs}"
def _text_from_doc(doc, key):
"""Get text from key or full doc"""
return doc.text if key is None else doc.attrs[key]