Source code for compass.extraction.apply

"""Ordinance function to apply ordinance extraction on a document"""

import logging
from warnings import warn

from compass.llm import JSONFromTextLLMCaller
from compass.extraction.date import DateExtractor
from compass.validation import (
    ParseChunksWithMemory,
    LegalTextValidator,
    parse_by_chunks,
)
from compass.utilities.ngrams import sentence_ngram_containment
from compass.warn import COMPASSWarning


logger = logging.getLogger(__name__)
# Tolerance multiplier on the input character count: when an LLM returns
# more than this many times the number of characters it was given,
# `_parse_if_input_text_not_empty` discards the output as a possible
# hallucination.
_TEXT_OUT_CHAR_BUFFER = 1.05


async def check_for_relevant_text(
    doc,
    model_config,
    heuristic,
    tech,
    text_collectors,
    usage_tracker=None,
    min_chunks_to_process=3,
):
    """Parse a single document for relevant text (e.g. ordinances)

    The results of the text parsing are stored in the document's attrs
    under the respective text collector label.

    Parameters
    ----------
    doc : BaseDocument
        A document instance (PDF, HTML, etc) potentially containing
        ordinance information. Note that if the document's ``attrs``
        already has the relevant text output, the corresponding text
        collector will not be run. To force a document to be processed
        by this function, remove all previously collected text from the
        document's ``attrs``.
    model_config : compass.llm.config.LLMConfig
        Configuration describing which LLM service, splitter, and call
        parameters should be used for extraction.
    heuristic : object
        Domain-specific heuristic implementing a ``check`` method to
        qualify text chunks for further processing.
    tech : str
        Technology of interest (e.g. "solar", "wind", etc). This is
        used to set up some document validation decision trees.
    text_collectors : iterable
        Iterable of text collector classes to run during document
        parsing. Each class must implement the
        :class:`compass.plugin.interface.BaseTextCollector` interface.
        ``None`` entries are ignored. If the document already contains
        text collected by a given collector (i.e. the collector's
        ``OUT_LABEL`` is found in ``doc.attrs``), that collector will
        be skipped.
    usage_tracker : UsageTracker, optional
        Optional tracker instance to monitor token usage during LLM
        calls. By default, ``None``.
    min_chunks_to_process : int, optional
        Minimum number of chunks to process before aborting due to text
        failing the heuristic or deemed not legal (if applicable).
        By default, ``3``.

    Returns
    -------
    bool
        ``True`` if any text was collected by any of the text
        collectors and ``False`` otherwise.

    Notes
    -----
    When no collector needs to run, the function returns ``False``
    before the document text is split or any validator is built.
    The legal-text validator is only created when the document's
    ``"check_if_legal_doc"`` attr is truthy (default ``True``).
    """
    # Select the collectors first: if none need to run we can return
    # before paying for text splitting and validator construction.
    collectors_to_run = [
        collector_class(
            llm_service=model_config.llm_service,
            usage_tracker=usage_tracker,
            **model_config.llm_call_kwargs,
        )
        for collector_class in text_collectors
        if collector_class is not None
        and collector_class.OUT_LABEL not in doc.attrs
    ]
    if not collectors_to_run:
        logger.debug(
            "No text collectors to run for document from %s",
            doc.attrs.get("source", "unknown source"),
        )
        return False

    chunks = model_config.text_splitter.split_text(doc.text)
    chunk_parser = ParseChunksWithMemory(chunks, num_to_recall=2)

    # Documents can opt out of the legal-text check via their attrs
    legal_text_validator = None
    if doc.attrs.get("check_if_legal_doc", True):
        legal_text_validator = LegalTextValidator(
            tech=tech,
            llm_service=model_config.llm_service,
            usage_tracker=usage_tracker,
            doc_is_from_ocr=doc.attrs.get("from_ocr", False),
            **model_config.llm_call_kwargs,
        )

    await parse_by_chunks(
        chunk_parser,
        heuristic,
        legal_text_validator,
        callbacks=[collector.check_chunk for collector in collectors_to_run],
        min_chunks_to_process=min_chunks_to_process,
    )

    found_text = False
    for collector in collectors_to_run:
        # Only collectors that actually gathered text write to attrs
        if text := collector.relevant_text:
            found_text = True
            doc.attrs[collector.OUT_LABEL] = text
            logger.debug_to_file(
                "%r text for %s is:\n%s",
                collector.OUT_LABEL,
                doc.attrs.get("source", "unknown source"),
                text,
            )

    return found_text
async def extract_date(doc, model_config, usage_tracker=None):
    """Parse a single document for date information

    Parameters
    ----------
    doc : BaseDocument
        A document potentially containing date information.
    model_config : compass.llm.config.LLMConfig
        Configuration supplying the LLM service, the text splitter,
        and the LLM call keyword arguments used for date extraction.
    usage_tracker : UsageTracker, optional
        Optional tracker instance to monitor token usage during LLM
        calls. By default, ``None``.

    Returns
    -------
    BaseDocument
        The input document. After a successful call its attrs contain
        a ``"date"`` key holding the parsed date information.

    Notes
    -----
    Documents whose attrs already contain a ``"date"`` key are
    returned as-is; no LLM caller is created for them.
    """
    if "date" not in doc.attrs:
        llm_caller = JSONFromTextLLMCaller(
            llm_service=model_config.llm_service,
            usage_tracker=usage_tracker,
            **model_config.llm_call_kwargs,
        )
        date_extractor = DateExtractor(llm_caller, model_config.text_splitter)
        doc.attrs["date"] = await date_extractor.parse(doc)
        return doc

    # A date is already recorded for this document; leave it untouched
    logger.debug(
        "Not extracting date for doc from %s. "
        "Found existing date in doc attrs: %r",
        doc.attrs.get("source"),
        doc.attrs["date"],
    )
    return doc
async def extract_relevant_text_with_llm(
    doc, text_splitter, extractor, original_text_key
):
    """Extract ordinance text from document using LLM

    The extractor's parsers are applied as a chain: the first parser
    reads the text stored under `original_text_key`, and every
    following parser reads the text written by the parser before it.

    Parameters
    ----------
    doc : BaseDocument
        A document known to contain ordinance information. This means
        it must contain the `original_text_key` key in the attrs. You
        can run :func:`check_for_relevant_text` to have this attribute
        populated automatically for documents that are found to contain
        relevant extraction text. Note that if the document's attrs
        does not contain the `original_text_key`, you will get an
        error.
    text_splitter : LCTextSplitter
        Langchain text splitter (or subclass instance), or any object
        that implements a `split_text` method. The method should take
        text as input (str) and return a list of text chunks.
    extractor : object
        Extractor instance exposing ``parsers``, an iterable of
        ``(output key, parser)`` pairs. Each parser's output is stored
        in ``doc.attrs`` under its output key.
    original_text_key : str
        String corresponding to the `doc.attrs` key containing the
        original text (before extraction).

    Returns
    -------
    BaseDocument
        Document that has been parsed for ordinance text. The results
        of the extraction are stored in the document's attrs.
    str
        Key corresponding to the cleaned ordinance text stored in the
        `doc.attrs` dictionary. This is the output key of the last
        parser, or `original_text_key` if the extractor has no parsers.
    """
    input_key = original_text_key
    for output_key, parse_step in extractor.parsers:
        input_text = doc.attrs[input_key]
        output_text = await _parse_if_input_text_not_empty(
            input_text, text_splitter, parse_step, input_key, output_key
        )
        doc.attrs[output_key] = output_text
        # The next parser in the chain consumes what this one produced
        input_key = output_key

    return doc, input_key
async def extract_relevant_text_with_ngram_validation(
    doc,
    text_splitter,
    extractor,
    original_text_key,
    n=4,
    num_extraction_attempts=3,
    ngram_fraction_threshold=0.9,
    ngram_ocr_fraction_threshold=0.75,
):
    """Extract ordinance text for a single document with known ord info

    This extraction includes an "ngram" check, which attempts to detect
    whether or not the cleaned text was extracted from the original
    ordinance text. The processing will attempt to re-extract the text
    if the validation does not pass a certain threshold until the
    maximum number of attempts is reached. If the text still does not
    pass validation at this point, there is a good chance that the LLM
    hallucinated parts of the output text, so caution should be taken.

    Parameters
    ----------
    doc : BaseDocument
        A document known to contain ordinance information. This means
        it must contain a non-empty `original_text_key` entry in the
        attrs. You can run
        :func:`~compass.extraction.apply.check_for_relevant_text`
        to have this attribute populated automatically for documents
        that are found to contain ordinance data. If the entry is
        missing or empty, a warning is emitted and the document is
        returned without being processed.
    text_splitter : LCTextSplitter
        Langchain text splitter (or subclass instance), or any object
        that implements a `split_text` method. The method should take
        text as input (str) and return a list of text chunks.
    extractor : object
        Extractor instance exposing ``parsers`` that consume text
        chunks and update ``doc.attrs``.
    original_text_key : str
        String corresponding to the `doc.attrs` key containing the
        original text (before extraction).
    n : int, optional
        Number of words to include per ngram for the ngram validation,
        which helps ensure that the LLM did not hallucinate. Values
        below 1 are raised to 1. By default, ``4``.
    num_extraction_attempts : int, optional
        Number of extraction attempts before giving up on text that
        did not pass the ngram check. Values below 1 are raised to 1.
        By default, ``3``.
    ngram_fraction_threshold : float, optional
        Fraction of ngrams in the cleaned text that must also be found
        in the original ordinance text for the extraction to be
        considered successful when the document is not flagged as
        coming from OCR. Clamped to the range [0, 1].
        By default, ``0.9``.
    ngram_ocr_fraction_threshold : float, optional
        Same as `ngram_fraction_threshold`, but applied when the
        document is flagged as coming from OCR. Clamped to the range
        [0, 1]. By default, ``0.75``.

    Returns
    -------
    BaseDocument
        Document that has been parsed for ordinance text. The results
        of the extraction are stored in the document's attrs.
    """
    if doc.attrs.get(original_text_key):
        # Sanitize user-supplied settings before handing off
        ngram_size = max(1, n)
        max_attempts = max(1, num_extraction_attempts)
        threshold = max(0, min(1, ngram_fraction_threshold))
        ocr_threshold = max(0, min(1, ngram_ocr_fraction_threshold))
        return await _extract_with_ngram_check(
            doc,
            text_splitter,
            extractor,
            original_text_key,
            n=ngram_size,
            num_tries=max_attempts,
            ngram_fraction_threshold=threshold,
            ngram_ocr_fraction_threshold=ocr_threshold,
        )

    # Nothing to extract from: warn and hand the document back untouched
    warn(
        f"Input document has no {original_text_key!r} key or string "
        "does not contain information. Please run "
        "`compass.extraction.check_for_relevant_text()` with the proper "
        "text collector prior to calling this method.",
        COMPASSWarning,
    )
    return doc
async def _extract_with_ngram_check(
    doc,
    text_splitter,
    extractor,
    original_text_key,
    n=4,
    num_tries=3,
    ngram_fraction_threshold=0.9,
    ngram_ocr_fraction_threshold=0.75,
):
    """Extract ordinance info from doc and validate using ngrams.

    Runs the LLM extraction up to `num_tries` times. After each
    attempt, the ngram containment of the extracted text within the
    original text is computed and compared against a threshold. The
    loop stops at the first attempt that meets the threshold.

    Parameters
    ----------
    doc : BaseDocument
        Document whose attrs hold the original text under
        `original_text_key`. The ``"source"`` and ``"from_ocr"`` attrs
        are read (when present) for logging and threshold selection.
    text_splitter : object
        Object with a ``split_text`` method, forwarded to
        :func:`extract_relevant_text_with_llm`.
    extractor : object
        Extractor forwarded to :func:`extract_relevant_text_with_llm`.
    original_text_key : str
        Key in ``doc.attrs`` under which the original text is stored.
    n : int, optional
        Number of words per ngram. By default, ``4``.
    num_tries : int, optional
        Maximum number of extraction attempts. By default, ``3``.
    ngram_fraction_threshold : float, optional
        Passing score for documents not flagged as OCR.
        By default, ``0.9``.
    ngram_ocr_fraction_threshold : float, optional
        Passing score for documents flagged as OCR.
        By default, ``0.75``.

    Returns
    -------
    BaseDocument
        The input document. When an attempt passes the check, the
        score is stored in attrs under
        ``f"{original_text_key}_ngram_score"``. When the original text
        is missing/empty, or no attempt passes, a
        :class:`COMPASSWarning` is emitted and no score is stored.
    """
    source = doc.attrs.get("source", "Unknown")
    doc_is_from_ocr = doc.attrs.get("from_ocr", False)

    # Use ``.get`` so that a missing key is reported through the warning
    # below instead of surfacing as a ``KeyError`` before the guard.
    original_text = doc.attrs.get(original_text_key)
    if not original_text:
        msg = (
            "Document missing original ordinance text! No extraction "
            f"performed (Document source: {source})"
        )
        warn(msg, COMPASSWarning)
        return doc

    ngram_thresh = (
        ngram_ocr_fraction_threshold
        if doc_is_from_ocr
        else ngram_fraction_threshold
    )

    best_score = 0
    for attempt in range(1, num_tries + 1):
        doc, out_text_key = await extract_relevant_text_with_llm(
            doc, text_splitter, extractor, original_text_key
        )
        cleaned_text = doc.attrs[out_text_key]
        if not cleaned_text:
            # Empty output cannot be scored; spend another attempt
            logger.debug(
                "No cleaned text found after extraction on attempt %d "
                "of %d for document with source %s. Retrying...",
                attempt,
                num_tries,
                source,
            )
            continue

        ngram_frac = sentence_ngram_containment(
            original=original_text, test=cleaned_text, n=n
        )
        if ngram_frac >= ngram_thresh:
            logger.debug(
                "Document extraction for %r passed ngram check on attempt %d "
                "of %d with score %.2f (OCR: %r; Document source: %s)",
                out_text_key,
                attempt,
                num_tries,
                ngram_frac,
                doc_is_from_ocr,
                source,
            )
            best_score = ngram_frac
            break

        best_score = max(best_score, ngram_frac)
        logger.debug(
            "Document extraction for %r failed ngram check on attempt %d of "
            "%d, with score %.2f (OCR: %r; Document source: %s). Retrying...",
            out_text_key,
            attempt,
            num_tries,
            ngram_frac,
            doc_is_from_ocr,
            source,
        )
    else:
        # Reached only when no attempt broke out of the loop, i.e. every
        # attempt was empty or scored below the threshold.
        # NOTE(review): the message says no extracted text is returned,
        # but this function does not clear the attrs written by the last
        # attempt -- confirm that callers key off the missing ngram
        # score rather than the presence of the extracted text.
        msg = (
            f"Ngram check failed after {num_tries} tries trying to extract "
            f"{original_text_key!r}. Not returning any extracted text due to "
            "high possibility of LLM hallucination! "
            f"(Best score: {best_score:.2f}; OCR: {doc_is_from_ocr}; "
            f"Document source: {source})"
        )
        warn(msg, COMPASSWarning)
        return doc

    doc.attrs[f"{original_text_key}_ngram_score"] = best_score
    return doc
async def extract_ordinance_values(doc, parser, text_key, out_key):
    """Extract ordinance values for a single document

    Document must be known to contain ordinance text.

    Parameters
    ----------
    doc : BaseDocument
        A document known to contain ordinance text. This means it must
        contain a `text_key` key in the attrs. You can run
        :func:`~compass.extraction.apply.extract_relevant_text_with_llm`
        to have this attribute populated automatically for documents
        that are found to contain ordinance data. Note that if the
        document's attrs does not contain the `text_key` key, it will
        not be processed.
    parser : object
        Parser instance with an async ``parse`` method. The method is
        awaited with the text stored under `text_key`.
    text_key : str
        Name of the key under which cleaned text is stored in
        `doc.attrs`. This text should be ready for extraction.
    out_key : str
        Name of the key under which the result of ``parser.parse``
        should be stored in `doc.attrs`.

    Returns
    -------
    BaseDocument
        The input document. When text was available, the parser output
        is stored in the document's attrs under `out_key`.

    Notes
    -----
    When the cleaned text is missing or empty the function emits a
    :class:`compass.warn.COMPASSWarning` and leaves ``doc`` unchanged.
    """
    cleaned_text = doc.attrs.get(text_key)
    if cleaned_text:
        doc.attrs[out_key] = await parser.parse(cleaned_text)
        return doc

    # No usable text under ``text_key``: warn instead of calling the parser
    warn(
        f"Input document has no {text_key!r} key or string "
        "does not contain info. Please run "
        "`extract_relevant_text_with_llm` prior to calling this method.",
        COMPASSWarning,
    )
    return doc
async def _parse_if_input_text_not_empty(
    text, text_splitter, parser, curr_text_name, next_text_name
):
    """Extract text using parser, or return empty if input empty

    Parameters
    ----------
    text : str
        Input text to run the parser on.
    text_splitter : object
        Object with a ``split_text`` method used to chunk `text`.
    parser : callable
        Async callable that is awaited with the list of text chunks
        and returns the extracted text.
    curr_text_name : str
        Label of the input text; only used in the warning message.
    next_text_name : str
        Label of the text being extracted; only used in messages.

    Returns
    -------
    str
        The falsy input unchanged (after a :class:`COMPASSWarning`) if
        `text` is empty, an empty string if the parser output is longer
        than ``_TEXT_OUT_CHAR_BUFFER`` times the input length, and the
        parser output otherwise.
    """
    if not text:
        warn(
            f"{curr_text_name!r} does not contain any text. Skipping "
            f"extraction for {next_text_name!r}",
            COMPASSWarning,
        )
        return text

    extracted_text = await parser(text_splitter.split_text(text))

    num_chars_in = len(text)
    num_chars_out = len(extracted_text)
    if num_chars_out <= _TEXT_OUT_CHAR_BUFFER * num_chars_in:
        logger.debug_to_file(
            "Extracted text for %r is:\n%s", next_text_name, extracted_text
        )
        return extracted_text

    # An extraction should not come back larger than its input (beyond
    # the small buffer), so such output is thrown away.
    logger.debug(
        "LLM output more text than was given (IN: %d, OUT: %d). "
        "Throwing away response due to possible hallucination...",
        num_chars_in,
        num_chars_out,
    )
    return ""