# Source code for compass.extraction.context

"""Extraction context for multi-document ordinance extraction"""

from textwrap import shorten
from collections.abc import Iterable

import pandas as pd

from compass.services.threaded import FileMover
from compass.exceptions import COMPASSTypeError


class ExtractionContext:
    """Context for extraction operations supporting multiple documents

    This class wraps one or more source documents behind a
    Document-like interface (``text``, ``pages``, ``attrs``) so that
    extraction code can treat a collection of documents as a single
    object. When the documents are serialized with
    :meth:`multi_doc_context`, each document's text is labeled with its
    index in the context so content can be traced back to its source.

    The context also keeps a separate list (``data_docs``) of the
    documents that were registered as actual data sources via
    :meth:`mark_doc_as_data_source`.
    """

    def __init__(self, documents=None, attrs=None):
        """

        Parameters
        ----------
        documents : sequence of BaseDocument, optional
            One or more source documents contributing to this context.
            A single non-iterable document is wrapped in a list, and
            ``None`` yields an empty context. By default, ``None``.
        attrs : dict, optional
            Context-level attributes for extraction metadata. A falsy
            value (``None`` or an empty dict) is replaced by a new
            empty dict. By default, ``None``.
        """
        self.attrs = attrs or {}
        self._documents = _as_list(documents)
        self._data_docs = []

    @property
    def text(self):
        """str: Concatenated text from all documents"""
        return self.multi_doc_context()

    @property
    def pages(self):
        """list: Concatenated pages from all documents"""
        return [page for doc in self.documents for page in doc.pages]

    @property
    def num_documents(self):
        """int: Number of source documents in this context"""
        return len(self.documents)

    @property
    def documents(self):
        """list: List of documents that might contain relevant info"""
        return self._documents

    @documents.setter
    def documents(self, other):
        # Same normalization as in ``__init__``: None -> [], single
        # non-iterable object -> [obj], any other iterable -> list
        self._documents = _as_list(other)

    @property
    def data_docs(self):
        """list: List of documents that contributed to extraction"""
        return self._data_docs

    @data_docs.setter
    def data_docs(self, other):
        # Unlike ``documents``, no coercion happens here: anything that
        # is not already a list is rejected
        if not isinstance(other, list):
            msg = "data_docs must be set to a *list* of documents"
            raise COMPASSTypeError(msg)
        self._data_docs = other

    def __str__(self):
        header = (
            f"{self.__class__.__name__} with {self.num_documents:,} document"
        )
        # Pluralize for every count except exactly one (including zero)
        if self.num_documents != 1:
            header = f"{header}s"

        if self.num_documents > 0:
            docs = "\n\t- ".join(
                [
                    d.attrs.get("source", "Unknown source")
                    for d in self.documents
                ]
            )
            header = f"{header}:\n\t- {docs}"

        data_docs = _data_docs_repr(self.data_docs)
        attrs = _attrs_repr(self.attrs)
        return f"{header}\n{data_docs}\n{attrs}"

    def __len__(self):
        return self.num_documents

    def __getitem__(self, index):
        return self.documents[index]

    def __iter__(self):
        return iter(self.documents)

    def __bool__(self):
        return bool(self.documents)

    async def mark_doc_as_data_source(self, doc, out_fn_stem=None):
        """Mark a document as a data source for extraction

        The document is appended to ``data_docs`` unconditionally (no
        de-duplication is performed).

        Parameters
        ----------
        doc : BaseDocument
            Document to add as a data source
        out_fn_stem : str, optional
            Optional output filename stem for this document. If
            provided, the document is additionally passed to the file
            mover service together with this stem, and the path
            returned by that service is stored under
            ``doc.attrs["out_fp"]``. By default, ``None``.
        """
        self._data_docs.append(doc)
        if out_fn_stem is not None:
            await _move_file_to_out_dir(doc, out_fn_stem)

    def multi_doc_context(self, attr_text_key=None):
        """Get concatenated text representation of documents

        Each document contributes one section consisting of a
        ``# SOURCE INDEX #`` line (its position in ``documents``)
        followed by a ``# CONTENT #`` line and the document text.
        Sections are separated by a blank line and preceded by a
        ``## MULTI-DOCUMENT CONTEXT ##`` header.

        Parameters
        ----------
        attr_text_key : str, optional
            The key used to look up the document's `.attrs` dictionary
            for the text to concatenate. If ``None``, the full document
            text is used for concatenation. By default, ``None``.

        Returns
        -------
        str
            Concatenated text representation of the documents in this
            context, or an empty string if the context holds no
            documents.

        Raises
        ------
        KeyError
            If `attr_text_key` is given but missing from the ``attrs``
            of one of the documents.
        """
        if not self.documents:
            return ""

        serialized = "\n\n".join(
            (
                f"# SOURCE INDEX #: {ind}\n"
                f"# CONTENT #:\n{_text_from_doc(doc, attr_text_key)}"
            )
            for ind, doc in enumerate(self.documents)
        )
        return f"## MULTI-DOCUMENT CONTEXT ##\n\n{serialized}"
async def _move_file_to_out_dir(doc, out_fn):
    """Pass a document to the file mover and record the output path

    Parameters
    ----------
    doc : BaseDocument
        Document to hand to ``FileMover``. Its ``attrs`` dictionary is
        updated in place.
    out_fn : str
        Output filename (stem) forwarded to ``FileMover.call``.

    Returns
    -------
    BaseDocument
        The same `doc` instance, with ``attrs["out_fp"]`` set to the
        value returned by ``FileMover.call``.
    """
    out_fp = await FileMover.call(doc, out_fn)
    doc.attrs["out_fp"] = out_fp
    return doc


def _as_list(documents):
    """Convert input to list

    ``None`` becomes an empty list, a non-iterable object is wrapped in
    a one-element list, and any other iterable is copied into a new
    list.
    """
    if documents is None:
        return []
    if not isinstance(documents, Iterable):
        return [documents]
    return list(documents)


def _data_docs_repr(data_docs):
    """String representation of data source documents

    Lists the ``"source"`` attribute of each document (or
    ``"Unknown source"`` when that attribute is missing), one per line.
    """
    if not data_docs:
        return "Registered Data Source Documents: None"

    data_docs = "\n\t- ".join(
        [d.attrs.get("source", "Unknown source") for d in data_docs]
    )
    return f"Registered Data Source Documents:\n\t- {data_docs}"


def _attrs_repr(attrs):
    """String representation of context attributes

    Keys are right-aligned to a common width. DataFrame values are
    summarized by their row count instead of being printed, and
    multi-line values are collapsed onto one line and truncated with
    :func:`textwrap.shorten`.
    """
    if not attrs:
        return "Attrs: None"

    attrs = {
        k: (
            f"DataFrame with {len(v):,} rows"
            if isinstance(v, pd.DataFrame)
            else v
        )
        for k, v in attrs.items()
    }

    # Measure and format the *string form* of each key so that
    # non-string keys (ints, tuples, None, ...) do not raise TypeError
    indent = max(len(str(k)) for k in attrs) + 2
    width = max(10, 80 - (indent + 4))
    to_join = []
    for k, v in attrs.items():
        v_str = str(v)
        # Only multi-line values are shortened; single-line values are
        # emitted in full regardless of length
        if "\n" in v_str:
            v_str = shorten(v_str, width=width)
        to_join.append(f"{k!s:>{indent}}:\t{v_str}")

    attrs = "\n".join(to_join)
    return f"Attrs:\n{attrs}"


def _text_from_doc(doc, key):
    """Get text from key or full doc

    Returns ``doc.text`` when `key` is ``None``; otherwise returns
    ``doc.attrs[key]`` (raising ``KeyError`` if the key is missing).
    """
    return doc.text if key is None else doc.attrs[key]