# Source code for compass.plugin.base
"""Base COMPASS extraction plugin interface"""
from abc import ABC, abstractmethod
from compass.pb import COMPASS_PB
from compass.services.threaded import UsageUpdater
from compass.utilities import compute_total_cost_from_usage
class BaseExtractionPlugin(ABC):
    """Base class for COMPASS extraction plugins

    This class provides the most extraction flexibility, but the
    implementer must define most functionality on their own.
    """

    def __init__(self, jurisdiction, model_configs, usage_tracker=None):
        """

        Parameters
        ----------
        jurisdiction : Jurisdiction
            Jurisdiction for which extraction is being performed.
        model_configs : dict
            Dictionary where keys are
            :class:`~compass.utilities.enums.LLMTasks` and values are
            :class:`~compass.llm.config.LLMConfig` instances to be used
            for those tasks.
        usage_tracker : UsageTracker, optional
            Usage tracker instance that can be used to record the LLM
            call cost. By default, ``None``.
        """
        self.jurisdiction = jurisdiction
        self.model_configs = model_configs
        self.usage_tracker = usage_tracker

    JURISDICTION_DATA_FP = None
    """:term:`path-like <path-like object>`: Path to jurisdiction CSV

    If provided, this CSV will extend the known jurisdictions (by
    default, US states, counties, and townships). This CSV must have the
    following columns:

        - State: The state in which the jurisdiction is located
          (e.g. "Texas")
        - County: The county in which the jurisdiction is located
          (e.g. "Travis"). This can be left blank if the jurisdiction is
          not associated with a county.
        - Subdivision: The name of the subdivision of the county in
          which the jurisdiction is located. Use this input for
          jurisdictions that do not map to counties/townships (e.g.
          water conservation districts, resource management plan areas,
          etc.). This can be left blank if the jurisdiction does not
          have the notion of a "subdivision".
        - Jurisdiction Type: The type of jurisdiction (e.g. "county",
          "township", "city", "special district", "RMP", etc.).
        - FIPS: The code to be used for the jurisdiction, if applicable
          (e.g. "48453" for Travis County, Texas, "22" for the
          Culberson County Groundwater Conservation District, etc.).
          This can be left blank if the jurisdiction does not have an
          applicable code.
        - Website: The official website for the jurisdiction, if
          applicable (e.g. "https://www.traviscountytx.gov/"). This can
          be left blank if the jurisdiction does not have an official
          website or if the website is not known.

    """

    @property
    @abstractmethod
    def IDENTIFIER(self):  # noqa: N802
        """str: Identifier for extraction task (e.g. "water rights")"""
        raise NotImplementedError

    @abstractmethod
    async def get_query_templates(self):
        """Get a list of search engine query templates for extraction

        Query templates can contain the placeholder ``{jurisdiction}``
        which will be replaced with the full jurisdiction name during
        the search engine query.

        Returns
        -------
        list
            Search engine query templates.
        """
        raise NotImplementedError

    @abstractmethod
    async def get_website_keywords(self):
        """Get a dict of website search keyword scores

        Dictionary mapping keywords to scores that indicate links which
        should be prioritized when performing a website scrape for a
        document.

        Returns
        -------
        dict
            Mapping of keywords to their scores.
        """
        raise NotImplementedError

    @abstractmethod
    async def get_heuristic(self):
        """Get a `BaseHeuristic` instance with a `check()` method

        The ``check()`` method should accept a string of text and return
        ``True`` if the text passes the heuristic check and ``False``
        otherwise.

        Returns
        -------
        BaseHeuristic
            Heuristic instance exposing a ``check()`` method.
        """
        raise NotImplementedError

    @abstractmethod
    async def filter_docs(
        self, extraction_context, need_jurisdiction_verification=True
    ):
        """Filter down candidate documents before parsing

        Parameters
        ----------
        extraction_context : ExtractionContext
            Context containing candidate documents to be filtered.
            Set the ``.documents`` attribute of this object to be the
            iterable of documents that should be kept for parsing.
        need_jurisdiction_verification : bool, optional
            Whether to verify that documents pertain to the correct
            jurisdiction. By default, ``True``.

        Returns
        -------
        ExtractionContext
            Context with filtered down documents.
        """
        raise NotImplementedError

    @abstractmethod
    async def parse_docs_for_structured_data(self, extraction_context):
        """Parse documents to extract structured data/information

        Parameters
        ----------
        extraction_context : ExtractionContext
            Context containing candidate documents to parse.

        Returns
        -------
        ExtractionContext or None
            Context with extracted data/information stored in the
            ``.attrs`` dictionary, or ``None`` if no data was extracted.
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def save_structured_data(cls, doc_infos, out_dir):
        """Write combined extracted structured data to disk

        Parameters
        ----------
        doc_infos : list of dict
            List of dictionaries containing the following keys:

                - "jurisdiction": An initialized Jurisdiction object
                  representing the jurisdiction that was extracted.
                - "ord_db_fp": A path to the extracted structured data
                  stored on disk, or ``None`` if no data was extracted.

        out_dir : path-like
            Path to the output directory for the data.

        Returns
        -------
        int
            Number of jurisdictions for which data was successfully
            extracted.
        """
        raise NotImplementedError

    async def record_usage(self):
        """Persist usage tracking data when a tracker is available

        Does nothing when ``self.usage_tracker`` is ``None``. Otherwise
        the tracker is passed to ``UsageUpdater.call``, the awaited
        result is converted to a total cost with
        ``compute_total_cost_from_usage``, and that cost is handed to
        ``COMPASS_PB.update_total_cost`` with ``replace=True``.
        """
        if self.usage_tracker is None:
            # Nothing was tracked for this plugin, so there is nothing
            # to persist or report
            return

        total_usage = await UsageUpdater.call(self.usage_tracker)
        total_cost = compute_total_cost_from_usage(total_usage)
        COMPASS_PB.update_total_cost(total_cost, replace=True)

    def validate_plugin_configuration(self):  # noqa: B027
        """[NOT PUBLIC API] Validate plugin is properly configured

        The base implementation performs no checks. It is deliberately
        not abstract (hence the ``B027`` suppression), presumably so
        that subclasses can override it only when they have something
        to validate.
        """