Source code for compass.extraction.apply
"""Ordinance function to apply ordinance extraction on a document"""
import logging
from warnings import warn
from compass.llm import JSONFromTextLLMCaller
from compass.extraction.date import DateExtractor
from compass.validation import (
ParseChunksWithMemory,
LegalTextValidator,
parse_by_chunks,
)
from compass.utilities.ngrams import sentence_ngram_containment
from compass.warn import COMPASSWarning
logger = logging.getLogger(__name__)
# If an LLM "extraction" returns more characters than the input text times
# this factor, the output cannot be a pure subset of the input and is treated
# as a hallucination and discarded (see ``_parse_if_input_text_not_empty``)
_TEXT_OUT_CHAR_BUFFER = 1.05
[docs]
async def check_for_relevant_text(
    doc,
    model_config,
    heuristic,
    tech,
    text_collectors,
    usage_tracker=None,
    min_chunks_to_process=3,
):
    """Scan a single document for relevant text (e.g. ordinances)

    Each text collector that finds relevant text stores it in the
    document's ``attrs`` under that collector's ``OUT_LABEL``.

    Parameters
    ----------
    doc : BaseDocument
        A document instance (PDF, HTML, etc) potentially containing
        ordinance information. A collector whose ``OUT_LABEL`` is
        already present in the document's ``attrs`` is skipped; to
        force reprocessing, remove previously collected text from the
        document's ``attrs``.
    model_config : compass.llm.config.LLMConfig
        Configuration describing which LLM service, splitter, and call
        parameters should be used for extraction.
    heuristic : object
        Domain-specific heuristic implementing a ``check`` method to
        qualify text chunks for further processing.
    tech : str
        Technology of interest (e.g. "solar", "wind", etc). Used to
        set up document validation decision trees.
    text_collectors : iterable
        Iterable of text collector classes implementing the
        :class:`compass.plugin.interface.BaseTextCollector` interface.
        ``None`` entries are ignored.
    usage_tracker : UsageTracker, optional
        Optional tracker instance to monitor token usage during
        LLM calls. By default, ``None``.
    min_chunks_to_process : int, optional
        Minimum number of chunks to process before aborting due to
        text failing the heuristic or deemed not legal (if
        applicable). By default, ``3``.

    Returns
    -------
    bool
        ``True`` if any text was collected by any of the text
        collectors and ``False`` otherwise.

    Notes
    -----
    The function updates progress bar logging as chunks are processed.
    """
    text_chunks = model_config.text_splitter.split_text(doc.text)
    memory_parser = ParseChunksWithMemory(text_chunks, num_to_recall=2)

    # Legal-document validation is opt-out via the doc's attrs
    validator = None
    if doc.attrs.get("check_if_legal_doc", True):
        validator = LegalTextValidator(
            tech=tech,
            llm_service=model_config.llm_service,
            usage_tracker=usage_tracker,
            doc_is_from_ocr=doc.attrs.get("from_ocr", False),
            **model_config.llm_call_kwargs,
        )

    # Instantiate only collectors whose output is not already on the doc
    active_collectors = [
        collector_class(
            llm_service=model_config.llm_service,
            usage_tracker=usage_tracker,
            **model_config.llm_call_kwargs,
        )
        for collector_class in text_collectors
        if collector_class is not None
        and collector_class.OUT_LABEL not in doc.attrs
    ]
    if not active_collectors:
        logger.debug(
            "No text collectors to run for document from %s",
            doc.attrs.get("source", "unknown source"),
        )
        return False

    await parse_by_chunks(
        memory_parser,
        heuristic,
        validator,
        callbacks=[c.check_chunk for c in active_collectors],
        min_chunks_to_process=min_chunks_to_process,
    )

    any_text_found = False
    for collector in active_collectors:
        relevant = collector.relevant_text
        if not relevant:
            continue
        any_text_found = True
        doc.attrs[collector.OUT_LABEL] = relevant
        logger.debug_to_file(
            "%r text for %s is:\n%s",
            collector.OUT_LABEL,
            doc.attrs.get("source", "unknown source"),
            relevant,
        )
    return any_text_found
[docs]
async def extract_date(doc, model_config, usage_tracker=None):
"""Parse a single document for date information
Parameters
----------
doc : BaseDocument
A document potentially containing date information.
model_config : compass.llm.config.LLMConfig
Configuration describing which LLM service, splitter, and call
parameters should be used for date extraction.
usage_tracker : UsageTracker, optional
Optional tracker instance to monitor token usage during
LLM calls. By default, ``None``.
Returns
-------
BaseDocument
Document that has been parsed for dates. The results of
the parsing are stored in the documents attrs. In particular,
the attrs will contain a ``"date"`` key that will contain the
parsed date information.
Notes
-----
Documents already containing a ``"date"`` attribute are returned
without reprocessing.
"""
if "date" in doc.attrs:
logger.debug(
"Not extracting date for doc from %s. "
"Found existing date in doc attrs: %r",
doc.attrs.get("source"),
doc.attrs["date"],
)
return doc
date_llm_caller = JSONFromTextLLMCaller(
llm_service=model_config.llm_service,
usage_tracker=usage_tracker,
**model_config.llm_call_kwargs,
)
doc.attrs["date"] = await DateExtractor(
date_llm_caller, model_config.text_splitter
).parse(doc)
return doc
[docs]
async def extract_relevant_text_with_llm(
    doc, text_splitter, extractor, original_text_key
):
    """Extract ordinance text from document using LLM

    Runs each of the extractor's parsers in sequence, feeding the
    output text of one parser as the input text of the next.

    Parameters
    ----------
    doc : BaseDocument
        A document known to contain ordinance information. It must
        contain the `original_text_key` key in the attrs (run
        :func:`check_for_relevant_text` to populate it automatically
        for documents found to contain relevant extraction text).
        A missing `original_text_key` raises a ``KeyError``.
    text_splitter : LCTextSplitter, optional
        Optional Langchain text splitter (or subclass instance), or
        any object implementing a `split_text` method that takes text
        (str) and returns a list of text chunks.
    extractor : object
        Extractor instance exposing ``parsers`` that consume text
        chunks and update ``doc.attrs``.
    original_text_key : str
        String corresponding to the `doc.attrs` key containing the
        original text (before extraction).

    Returns
    -------
    BaseDocument
        Document that has been parsed for ordinance text. The results
        of the extraction are stored in the document's attrs.
    str
        Key corresponding to the cleaned ordinance text stored in the
        `doc.attrs` dictionary.
    """
    current_key = original_text_key
    for out_key, parser in extractor.parsers:
        # Each stage reads the previous stage's output from the attrs
        doc.attrs[out_key] = await _parse_if_input_text_not_empty(
            doc.attrs[current_key],
            text_splitter,
            parser,
            current_key,
            out_key,
        )
        current_key = out_key
    return doc, current_key
[docs]
async def extract_relevant_text_with_ngram_validation(
    doc,
    text_splitter,
    extractor,
    original_text_key,
    n=4,
    num_extraction_attempts=3,
    ngram_fraction_threshold=0.9,
    ngram_ocr_fraction_threshold=0.75,
):
    """Extract ordinance text for a single document with known ord info

    This extraction includes an "ngram" check, which attempts to
    detect whether or not the cleaned text was extracted from the
    original ordinance text. The processing will re-extract the text
    until validation passes a certain threshold or the maximum number
    of attempts is reached. If the text still does not pass validation
    at that point, there is a good chance the LLM hallucinated parts
    of the output text, so caution should be taken.

    Parameters
    ----------
    doc : BaseDocument
        A document known to contain ordinance information. It must
        contain the `original_text_key` key in the attrs. You can run
        :func:`~compass.extraction.apply.check_for_relevant_text`
        to have this attribute populated automatically for documents
        that are found to contain ordinance data. If the document's
        attrs does not contain the `original_text_key` key, it will
        not be processed.
    text_splitter : LCTextSplitter, optional
        Optional Langchain text splitter (or subclass instance), or
        any object implementing a `split_text` method that takes text
        (str) and returns a list of text chunks.
    extractor : object
        Extractor instance exposing ``parsers`` that consume text
        chunks and update ``doc.attrs``.
    original_text_key : str
        String corresponding to the `doc.attrs` key containing the
        original text (before extraction).
    n : int, optional
        Number of words to include per ngram for the ngram validation,
        which helps ensure that the LLM did not hallucinate.
        By default, ``4``.
    num_extraction_attempts : int, optional
        Number of extraction attempts before returning text that did
        not pass the ngram check. If processing exceeds this value,
        there is a good chance that the LLM hallucinated parts of the
        output text. Cannot be negative or 0. By default, ``3``.
    ngram_fraction_threshold : float, optional
        Fraction of ngrams in the cleaned text that are also found in
        the original ordinance text (parsed using poppler) for the
        extraction to be considered successful. Should be a value
        between 0 and 1 (inclusive). By default, ``0.9``.
    ngram_ocr_fraction_threshold : float, optional
        Fraction of ngrams in the cleaned text that are also found in
        the original ordinance text (parsed using OCR) for the
        extraction to be considered successful. Should be a value
        between 0 and 1 (inclusive). By default, ``0.75``.

    Returns
    -------
    BaseDocument
        Document that has been parsed for ordinance text. The results
        of the extraction are stored in the document's attrs.
    """
    if not doc.attrs.get(original_text_key):
        msg = (
            f"Input document has no {original_text_key!r} key or string "
            "does not contain information. Please run "
            "`compass.extraction.check_for_relevant_text()` with the proper "
            "text collector prior to calling this method."
        )
        warn(msg, COMPASSWarning)
        return doc

    def _clamp01(fraction):
        """Clamp a threshold into the valid [0, 1] range"""
        return min(max(fraction, 0), 1)

    return await _extract_with_ngram_check(
        doc,
        text_splitter,
        extractor,
        original_text_key,
        n=max(1, n),
        num_tries=max(1, num_extraction_attempts),
        ngram_fraction_threshold=_clamp01(ngram_fraction_threshold),
        ngram_ocr_fraction_threshold=_clamp01(ngram_ocr_fraction_threshold),
    )
async def _extract_with_ngram_check(
    doc,
    text_splitter,
    extractor,
    original_text_key,
    n=4,
    num_tries=3,
    ngram_fraction_threshold=0.9,
    ngram_ocr_fraction_threshold=0.75,
):
    """Extract ordinance info from doc and validate using ngrams.

    Repeatedly runs :func:`extract_relevant_text_with_llm` (up to
    `num_tries` times) until the extracted text's sentence-ngram
    containment in the original text meets the applicable threshold.
    On success, the winning score is stored in
    ``doc.attrs[f"{original_text_key}_ngram_score"]``.
    """
    source = doc.attrs.get("source", "Unknown")
    doc_is_from_ocr = doc.attrs.get("from_ocr", False)
    original_text = doc.attrs[original_text_key]
    if not original_text:
        msg = (
            "Document missing original ordinance text! No extraction "
            f"performed (Document source: {source})"
        )
        warn(msg, COMPASSWarning)
        return doc
    # OCR-parsed text is noisier, so it gets the looser threshold
    ngram_thresh = (
        ngram_ocr_fraction_threshold
        if doc_is_from_ocr
        else ngram_fraction_threshold
    )
    best_score = 0
    for attempt in range(1, num_tries + 1):
        doc, out_text_key = await extract_relevant_text_with_llm(
            doc, text_splitter, extractor, original_text_key
        )
        cleaned_text = doc.attrs[out_text_key]
        # Empty extraction output: retry without scoring this attempt
        if not cleaned_text:
            logger.debug(
                "No cleaned text found after extraction on attempt %d "
                "of %d for document with source %s. Retrying...",
                attempt,
                num_tries,
                source,
            )
            continue
        ngram_frac = sentence_ngram_containment(
            original=original_text, test=cleaned_text, n=n
        )
        if ngram_frac >= ngram_thresh:
            logger.debug(
                "Document extraction for %r passed ngram check on attempt %d "
                "of %d with score %.2f (OCR: %r; Document source: %s)",
                out_text_key,
                attempt,
                num_tries,
                ngram_frac,
                doc_is_from_ocr,
                source,
            )
            best_score = ngram_frac
            break
        # Track the best failing score for the final warning message
        best_score = max(best_score, ngram_frac)
        logger.debug(
            "Document extraction for %r failed ngram check on attempt %d of "
            "%d, with score %.2f (OCR: %r; Document source: %s). Retrying...",
            out_text_key,
            attempt,
            num_tries,
            ngram_frac,
            doc_is_from_ocr,
            source,
        )
    else:
        # for/else: only reached when no attempt hit ``break`` above.
        # NOTE(review): the last attempt's extracted text remains in
        # ``doc.attrs`` here even though the warning says no text is
        # returned — confirm downstream code keys off the missing
        # ``*_ngram_score`` attribute rather than the text itself.
        msg = (
            f"Ngram check failed after {num_tries} tries trying to extract "
            f"{original_text_key!r}. Not returning any extracted text due to "
            "high possibility of LLM hallucination! "
            f"(Best score: {best_score:.2f}; OCR: {doc_is_from_ocr}; "
            f"Document source: {source})"
        )
        warn(msg, COMPASSWarning)
        return doc
    doc.attrs[f"{original_text_key}_ngram_score"] = best_score
    return doc
[docs]
async def extract_ordinance_values(doc, parser, text_key, out_key):
    """Extract ordinance values for a single document

    Document must be known to contain ordinance text.

    Parameters
    ----------
    doc : BaseDocument
        A document known to contain ordinance text. It must contain
        the `text_key` key in the attrs. You can run
        :func:`~compass.extraction.apply.extract_relevant_text_with_llm`
        to have this attribute populated automatically for documents
        that are found to contain ordinance data. If the document's
        attrs does not contain the `text_key` key, it will not be
        processed.
    parser : object
        Parser instance with an async ``parse`` method that converts
        cleaned ordinance text into structured values.
    text_key : str
        Name of the key under which cleaned text is stored in
        `doc.attrs`. This text should be ready for extraction.
    out_key : str
        Name of the key under which extracted ordinances should be
        stored.

    Returns
    -------
    BaseDocument
        Document that has been parsed for ordinance values. The
        results of the extraction are stored in the document's attrs.

    Notes
    -----
    When the cleaned text is missing or empty the function emits a
    :class:`compass.warn.COMPASSWarning` and leaves ``doc`` unchanged.
    """
    cleaned_text = doc.attrs.get(text_key)
    if not cleaned_text:
        msg = (
            f"Input document has no {text_key!r} key or string "
            "does not contain info. Please run "
            "`extract_relevant_text_with_llm` prior to calling this method."
        )
        warn(msg, COMPASSWarning)
        return doc

    doc.attrs[out_key] = await parser.parse(cleaned_text)
    return doc
async def _parse_if_input_text_not_empty(
    text, text_splitter, parser, curr_text_name, next_text_name
):
    """Extract text using parser, or return empty if input empty

    Output longer than the input by more than the
    ``_TEXT_OUT_CHAR_BUFFER`` factor is discarded as a likely
    hallucination (an empty string is returned in that case).
    """
    if not text:
        msg = (
            f"{curr_text_name!r} does not contain any text. Skipping "
            f"extraction for {next_text_name!r}"
        )
        warn(msg, COMPASSWarning)
        return text

    chunks = text_splitter.split_text(text)
    llm_output = await parser(chunks)

    # Extraction should only ever be a subset of the input; anything
    # substantially longer cannot have come from the source text
    max_allowed_chars = _TEXT_OUT_CHAR_BUFFER * len(text)
    if len(llm_output) > max_allowed_chars:
        logger.debug(
            "LLM output more text than was given (IN: %d, OUT: %d). "
            "Throwing away response due to possible hallucination...",
            len(text),
            len(llm_output),
        )
        return ""

    logger.debug_to_file(
        "Extracted text for %r is:\n%s", next_text_name, llm_output
    )
    return llm_output