Source code for compass.extraction.apply
"""Ordinance function to apply ordinance extraction on a document"""
import logging
from warnings import warn
from compass.llm import JSONFromTextLLMCaller
from compass.extraction.date import DateExtractor
from compass.validation import (
ParseChunksWithMemory,
LegalTextValidator,
parse_by_chunks,
)
from compass.utilities.ngrams import sentence_ngram_containment
from compass.warn import COMPASSWarning
logger = logging.getLogger(__name__)
# If an LLM "extraction" returns more characters than the input text times
# this factor, the output cannot be a pure subset of the input and is treated
# as a hallucination and discarded (see ``_parse_if_input_text_not_empty``)
_TEXT_OUT_CHAR_BUFFER = 1.05
[docs]
async def check_for_relevant_text(
    doc,
    model_config,
    heuristic,
    tech,
    text_collectors,
    usage_tracker=None,
    min_chunks_to_process=3,
):
    """Scan a single document for relevant text (e.g. ordinances)

    Each text collector that finds relevant text stores it in the
    document's ``attrs`` under that collector's ``OUT_LABEL``.

    Parameters
    ----------
    doc : BaseDocument
        A document instance (PDF, HTML, etc) potentially containing
        ordinance information. A collector whose ``OUT_LABEL`` is
        already present in the document's ``attrs`` is skipped; to
        force reprocessing, remove previously collected text from the
        document's ``attrs``.
    model_config : compass.llm.config.LLMConfig
        Configuration describing which LLM service, splitter, and call
        parameters should be used for extraction.
    heuristic : object
        Domain-specific heuristic implementing a ``check`` method to
        qualify text chunks for further processing.
    tech : str
        Technology of interest (e.g. "solar", "wind", etc). Used to
        set up document validation decision trees.
    text_collectors : iterable
        Iterable of text collector classes implementing the
        :class:`compass.plugin.interface.BaseTextCollector` interface.
        ``None`` entries are ignored.
    usage_tracker : UsageTracker, optional
        Optional tracker instance to monitor token usage during
        LLM calls. By default, ``None``.
    min_chunks_to_process : int, optional
        Minimum number of chunks to process before aborting due to
        text failing the heuristic or deemed not legal (if
        applicable). By default, ``3``.

    Returns
    -------
    bool
        ``True`` if any text was collected by any of the text
        collectors and ``False`` otherwise.

    Notes
    -----
    The function updates progress bar logging as chunks are processed.
    """
    text_chunks = model_config.text_splitter.split_text(doc.text)
    memory_parser = ParseChunksWithMemory(text_chunks, num_to_recall=2)

    # Legal-document validation is opt-out via the doc's attrs
    validator = None
    if doc.attrs.get("check_if_legal_doc", True):
        validator = LegalTextValidator(
            tech=tech,
            llm_service=model_config.llm_service,
            usage_tracker=usage_tracker,
            doc_is_from_ocr=doc.attrs.get("from_ocr", False),
            **model_config.llm_call_kwargs,
        )

    # Instantiate only collectors whose output is not already on the doc
    active_collectors = [
        collector_class(
            llm_service=model_config.llm_service,
            usage_tracker=usage_tracker,
            **model_config.llm_call_kwargs,
        )
        for collector_class in text_collectors
        if collector_class is not None
        and collector_class.OUT_LABEL not in doc.attrs
    ]
    if not active_collectors:
        logger.debug(
            "No text collectors to run for document from %s",
            doc.attrs.get("source", "unknown source"),
        )
        return False

    await parse_by_chunks(
        memory_parser,
        heuristic,
        validator,
        callbacks=[c.check_chunk for c in active_collectors],
        min_chunks_to_process=min_chunks_to_process,
    )

    any_text_found = False
    for collector in active_collectors:
        relevant = collector.relevant_text
        if not relevant:
            continue
        any_text_found = True
        doc.attrs[collector.OUT_LABEL] = relevant
        logger.debug_to_file(
            "%r text for %s is:\n%s",
            collector.OUT_LABEL,
            doc.attrs.get("source", "unknown source"),
            relevant,
        )
    return any_text_found
[docs]
async def extract_date(doc, model_config, usage_tracker=None):
"""Parse a single document for date information
Parameters
----------
doc : BaseDocument
A document potentially containing date information.
model_config : compass.llm.config.LLMConfig
Configuration describing which LLM service, splitter, and call
parameters should be used for date extraction.
usage_tracker : UsageTracker, optional
Optional tracker instance to monitor token usage during
LLM calls. By default, ``None``.
Returns
-------
BaseDocument
Document that has been parsed for dates. The results of
the parsing are stored in the documents attrs. In particular,
the attrs will contain a ``"date"`` key that will contain the
parsed date information.
Notes
-----
Documents already containing a ``"date"`` attribute are returned
without reprocessing.
"""
if "date" in doc.attrs:
logger.debug(
"Not extracting date for doc from %s. "
"Found existing date in doc attrs: %r",
doc.attrs.get("source"),
doc.attrs["date"],
)
return doc
date_llm_caller = JSONFromTextLLMCaller(
llm_service=model_config.llm_service,
usage_tracker=usage_tracker,
**model_config.llm_call_kwargs,
)
doc.attrs["date"] = await DateExtractor(
date_llm_caller, model_config.text_splitter
).parse(doc)
return doc
[docs]
async def extract_relevant_text_with_llm(
    doc, text_splitter, extractor, original_text_key
):
    """Extract ordinance text from document using LLM

    Runs each of the extractor's parsers in sequence, feeding the
    output text of one parser as the input text of the next.

    Parameters
    ----------
    doc : BaseDocument
        A document known to contain ordinance information. It must
        contain the `original_text_key` key in the attrs (run
        :func:`check_for_relevant_text` to populate it automatically
        for documents found to contain relevant extraction text).
        A missing `original_text_key` raises a ``KeyError``.
    text_splitter : LCTextSplitter, optional
        Optional Langchain text splitter (or subclass instance), or
        any object implementing a `split_text` method that takes text
        (str) and returns a list of text chunks.
    extractor : object
        Extractor instance exposing ``parsers`` that consume text
        chunks and update ``doc.attrs``.
    original_text_key : str
        String corresponding to the `doc.attrs` key containing the
        original text (before extraction).

    Returns
    -------
    BaseDocument
        Document that has been parsed for ordinance text. The results
        of the extraction are stored in the document's attrs.
    str
        Key corresponding to the cleaned ordinance text stored in the
        `doc.attrs` dictionary.
    """
    current_key = original_text_key
    for out_key, parser in extractor.parsers:
        # Each stage reads the previous stage's output from the attrs
        doc.attrs[out_key] = await _parse_if_input_text_not_empty(
            doc.attrs[current_key],
            text_splitter,
            parser,
            current_key,
            out_key,
        )
        current_key = out_key
    return doc, current_key
[docs]
async def extract_relevant_text_with_ngram_validation(
    doc,
    text_splitter,
    extractor,
    original_text_key,
    n=4,
    num_extraction_attempts=3,
    ngram_fraction_threshold=0.9,
    ngram_ocr_fraction_threshold=0.75,
):
    """Extract ordinance text for a single document with known ord info

    This extraction includes an "ngram" check, which attempts to
    detect whether or not the cleaned text was extracted from the
    original ordinance text. The processing will re-extract the text
    until validation passes a certain threshold or the maximum number
    of attempts is reached. If the text still does not pass validation
    at that point, there is a good chance the LLM hallucinated parts
    of the output text, so caution should be taken.

    Parameters
    ----------
    doc : BaseDocument
        A document known to contain ordinance information. It must
        contain the `original_text_key` key in the attrs. You can run
        :func:`~compass.extraction.apply.check_for_relevant_text`
        to have this attribute populated automatically for documents
        that are found to contain ordinance data. If the document's
        attrs does not contain the `original_text_key` key, it will
        not be processed.
    text_splitter : LCTextSplitter, optional
        Optional Langchain text splitter (or subclass instance), or
        any object implementing a `split_text` method that takes text
        (str) and returns a list of text chunks.
    extractor : object
        Extractor instance exposing ``parsers`` that consume text
        chunks and update ``doc.attrs``.
    original_text_key : str
        String corresponding to the `doc.attrs` key containing the
        original text (before extraction).
    n : int, optional
        Number of words to include per ngram for the ngram validation,
        which helps ensure that the LLM did not hallucinate.
        By default, ``4``.
    num_extraction_attempts : int, optional
        Number of extraction attempts before returning text that did
        not pass the ngram check. If processing exceeds this value,
        there is a good chance that the LLM hallucinated parts of the
        output text. Cannot be negative or 0. By default, ``3``.
    ngram_fraction_threshold : float, optional
        Fraction of ngrams in the cleaned text that are also found in
        the original ordinance text (parsed using poppler) for the
        extraction to be considered successful. Should be a value
        between 0 and 1 (inclusive). By default, ``0.9``.
    ngram_ocr_fraction_threshold : float, optional
        Fraction of ngrams in the cleaned text that are also found in
        the original ordinance text (parsed using OCR) for the
        extraction to be considered successful. Should be a value
        between 0 and 1 (inclusive). By default, ``0.75``.

    Returns
    -------
    BaseDocument
        Document that has been parsed for ordinance text. The results
        of the extraction are stored in the document's attrs.
    """
    if not doc.attrs.get(original_text_key):
        msg = (
            f"Input document has no {original_text_key!r} key or string "
            "does not contain information. Please run "
            "`compass.extraction.check_for_relevant_text()` with the proper "
            "text collector prior to calling this method."
        )
        warn(msg, COMPASSWarning)
        return doc

    def _clamp01(fraction):
        """Clamp a threshold into the valid [0, 1] range"""
        return min(max(fraction, 0), 1)

    return await _extract_with_ngram_check(
        doc,
        text_splitter,
        extractor,
        original_text_key,
        n=max(1, n),
        num_tries=max(1, num_extraction_attempts),
        ngram_fraction_threshold=_clamp01(ngram_fraction_threshold),
        ngram_ocr_fraction_threshold=_clamp01(ngram_ocr_fraction_threshold),
    )
async def _extract_with_ngram_check(
    doc,
    text_splitter,
    extractor,
    original_text_key,
    n=4,
    num_tries=3,
    ngram_fraction_threshold=0.9,
    ngram_ocr_fraction_threshold=0.75,
):
    """Extract ordinance info from doc and validate using ngrams.

    Repeatedly runs :func:`extract_relevant_text_with_llm` (up to
    `num_tries` times) until the extracted text's sentence-ngram
    containment in the original text meets the applicable threshold.
    On success, the winning score is stored in
    ``doc.attrs[f"{original_text_key}_ngram_score"]``.
    """
    source = doc.attrs.get("source", "Unknown")
    doc_is_from_ocr = doc.attrs.get("from_ocr", False)
    original_text = doc.attrs[original_text_key]
    if not original_text:
        msg = (
            "Document missing original ordinance text! No extraction "
            f"performed (Document source: {source})"
        )
        warn(msg, COMPASSWarning)
        return doc
    # OCR-parsed text is noisier, so it gets the looser threshold
    ngram_thresh = (
        ngram_ocr_fraction_threshold
        if doc_is_from_ocr
        else ngram_fraction_threshold
    )
    best_score = 0
    for attempt in range(1, num_tries + 1):
        doc, out_text_key = await extract_relevant_text_with_llm(
            doc, text_splitter, extractor, original_text_key
        )
        cleaned_text = doc.attrs[out_text_key]
        # Empty extraction output: retry without scoring this attempt
        if not cleaned_text:
            logger.debug(
                "No cleaned text found after extraction on attempt %d "
                "of %d for document with source %s. Retrying...",
                attempt,
                num_tries,
                source,
            )
            continue
        ngram_frac = sentence_ngram_containment(
            original=original_text, test=cleaned_text, n=n
        )
        if ngram_frac >= ngram_thresh:
            logger.debug(
                "Document extraction for %r passed ngram check on attempt %d "
                "of %d with score %.2f (OCR: %r; Document source: %s)",
                out_text_key,
                attempt,
                num_tries,
                ngram_frac,
                doc_is_from_ocr,
                source,
            )
            best_score = ngram_frac
            break
        # Track the best failing score for the final warning message
        best_score = max(best_score, ngram_frac)
        logger.debug(
            "Document extraction for %r failed ngram check on attempt %d of "
            "%d, with score %.2f (OCR: %r; Document source: %s). Retrying...",
            out_text_key,
            attempt,
            num_tries,
            ngram_frac,
            doc_is_from_ocr,
            source,
        )
    else:
        # for/else: only reached when no attempt hit ``break`` above.
        # NOTE(review): the last attempt's extracted text remains in
        # ``doc.attrs`` here even though the warning says no text is
        # returned — confirm downstream code keys off the missing
        # ``*_ngram_score`` attribute rather than the text itself.
        msg = (
            f"Ngram check failed after {num_tries} tries trying to extract "
            f"{original_text_key!r}. Not returning any extracted text due to "
            "high possibility of LLM hallucination! "
            f"(Best score: {best_score:.2f}; OCR: {doc_is_from_ocr}; "
            f"Document source: {source})"
        )
        warn(msg, COMPASSWarning)
        return doc
    doc.attrs[f"{original_text_key}_ngram_score"] = best_score
    return doc
[docs]
async def extract_ordinance_values(doc, parser, text_key, out_key):
    """Extract ordinance values for a single document

    Document must be known to contain ordinance text.

    Parameters
    ----------
    doc : BaseDocument
        A document known to contain ordinance text. It must contain
        the `text_key` key in the attrs. You can run
        :func:`~compass.extraction.apply.extract_relevant_text_with_llm`
        to have this attribute populated automatically for documents
        that are found to contain ordinance data. If the document's
        attrs does not contain the `text_key` key, it will not be
        processed.
    parser : object
        Parser instance with an async ``parse`` method that converts
        cleaned ordinance text into structured values.
    text_key : str
        Name of the key under which cleaned text is stored in
        `doc.attrs`. This text should be ready for extraction.
    out_key : str
        Name of the key under which extracted ordinances should be
        stored.

    Returns
    -------
    BaseDocument
        Document that has been parsed for ordinance values. The
        results of the extraction are stored in the document's attrs.

    Notes
    -----
    When the cleaned text is missing or empty the function emits a
    :class:`compass.warn.COMPASSWarning` and leaves ``doc`` unchanged.
    """
    cleaned_text = doc.attrs.get(text_key)
    if not cleaned_text:
        msg = (
            f"Input document has no {text_key!r} key or string "
            "does not contain info. Please run "
            "`extract_relevant_text_with_llm` prior to calling this method."
        )
        warn(msg, COMPASSWarning)
        return doc

    doc.attrs[out_key] = await parser.parse(cleaned_text)
    return doc
async def _parse_if_input_text_not_empty(
    text, text_splitter, parser, curr_text_name, next_text_name
):
    """Extract text using parser, or return empty if input empty

    Output longer than the input by more than the
    ``_TEXT_OUT_CHAR_BUFFER`` factor is discarded as a likely
    hallucination (an empty string is returned in that case).
    """
    if not text:
        msg = (
            f"{curr_text_name!r} does not contain any text. Skipping "
            f"extraction for {next_text_name!r}"
        )
        warn(msg, COMPASSWarning)
        return text

    chunks = text_splitter.split_text(text)
    llm_output = await parser(chunks)

    # Extraction should only ever be a subset of the input; anything
    # substantially longer cannot have come from the source text
    max_allowed_chars = _TEXT_OUT_CHAR_BUFFER * len(text)
    if len(llm_output) > max_allowed_chars:
        logger.debug(
            "LLM output more text than was given (IN: %d, OUT: %d). "
            "Throwing away response due to possible hallucination...",
            len(text),
            len(llm_output),
        )
        return ""

    logger.debug_to_file(
        "Extracted text for %r is:\n%s", next_text_name, llm_output
    )
    return llm_output