"""Ordinance file downloading logic"""
import pprint
import logging
from contextlib import AsyncExitStack
from elm.web.search.run import load_docs, search_with_fallback
from elm.web.website_crawl import (
_SCORE_KEY, # noqa: PLC2701
ELMWebsiteCrawler,
ELMLinkScorer,
)
from elm.web.utilities import filter_documents
from compass.web.search import search_single_jurisdiction
from compass.extraction import check_for_relevant_text, extract_date
from compass.services.threaded import TempFileCache, TempFileCachePB
from compass.validation.location import (
DTreeJurisdictionValidator,
JurisdictionValidator,
JurisdictionWebsiteValidator,
)
from compass.web.file_loader import (
COMPASSWebFileLoader,
COMPASSLocalFileLoader,
)
from compass.web.website_crawl import COMPASSCrawler, COMPASSLinkScorer
from compass.web.url_utils import sanitize_url
from compass.utilities.enums import LLMTasks, COMPASSDocumentCollectionStep
from compass.utilities.parsing import is_pdf_doc
from compass.pb import COMPASS_PB
# Module-level logger.
# NOTE(review): functions in this module call ``logger.trace``, which is not
# part of the stdlib ``logging.Logger`` API -- presumably a custom TRACE level
# is registered elsewhere in the project; confirm.
logger = logging.getLogger(__name__)
# Negative infinity; used by ``_ord_doc_sorting_key`` as the fallback for
# missing date components so undated documents sort last on those fields.
_NEG_INF = -1 * float("infinity")
# ``doc.attrs`` key under which ``_docs_from_web_search`` stores the search
# result rank of a document (read back by ``_ord_doc_sorting_key``).
_COLLECTION_SCORE_KEY = "collection_step_rank"
[docs]
async def download_known_urls(
    jurisdiction, urls, browser_semaphore=None, file_loader_kwargs=None
):
    """Download documents from known URLs

    Parameters
    ----------
    jurisdiction : Jurisdiction
        Jurisdiction instance representing the jurisdiction
        corresponding to the documents. Its ``full_name`` is used to
        label progress bar updates.
    urls : collection of str
        Collection of URLs to download documents from. Must support
        ``len()`` (it is used to size the download progress bar).
    browser_semaphore : :class:`asyncio.Semaphore`, optional
        Semaphore instance that can be used to limit the number of
        downloads happening concurrently. If ``None``, no limits
        are applied. By default, ``None``.
    file_loader_kwargs : dict, optional
        Dictionary of keyword arguments pairs used to initialize the
        :class:`~compass.web.file_loader.COMPASSWebFileLoader` that
        performs the downloads. Any ``"file_cache_coroutine"`` entry is
        overridden with ``TempFileCachePB.call``. The input dictionary
        itself is never modified. By default, ``None``.

    Returns
    -------
    out_docs : list
        List of BaseDocument instances containing documents from the
        URL's, or an empty list if something went wrong during the
        retrieval process.

    Notes
    -----
    Requires :class:`~compass.services.threaded.TempFileCachePB`
    service to be running.
    """
    COMPASS_PB.update_jurisdiction_task(
        jurisdiction.full_name,
        description="Downloading known URL(s)...",
    )

    # Work on a copy so that the caller's dictionary is not mutated when
    # the cache coroutine is injected
    loader_kwargs = dict(file_loader_kwargs or {})
    loader_kwargs["file_cache_coroutine"] = TempFileCachePB.call
    logger.trace(
        "kwargs for COMPASSWebFileLoader:\n%s",
        pprint.PrettyPrinter().pformat(loader_kwargs),
    )
    file_loader = COMPASSWebFileLoader(
        browser_semaphore=browser_semaphore, **loader_kwargs
    )

    async with COMPASS_PB.file_download_prog_bar(
        jurisdiction.full_name, len(urls)
    ):
        try:
            out_docs = await load_docs(urls, file_loader)
        except KeyboardInterrupt:
            raise
        except Exception as e:
            # Best-effort download: log the failure (with traceback) and
            # fall back to an empty result instead of propagating
            msg = (
                "Encountered error of type %r while downloading known URLs: %r"
            )
            err_type = type(e)
            logger.exception(msg, err_type, urls)
            out_docs = []

    return out_docs
[docs]
async def load_known_docs(jurisdiction, fps, local_file_loader_kwargs=None):
    """Load documents from known local paths

    Parameters
    ----------
    jurisdiction : Jurisdiction
        Jurisdiction instance representing the jurisdiction
        corresponding to the documents. Its ``full_name`` is used to
        label progress bar updates.
    fps : collection of path-like
        Collection of paths to load documents from. Must support
        ``len()`` (it is used to size the progress bar).
    local_file_loader_kwargs : dict, optional
        Dictionary of keyword arguments pairs used to initialize the
        :class:`~compass.web.file_loader.COMPASSLocalFileLoader` that
        reads the files. Any ``"file_cache_coroutine"`` entry is
        overridden with ``TempFileCachePB.call``. The input dictionary
        itself is never modified. By default, ``None``.

    Returns
    -------
    out_docs : list
        List of BaseDocument instances containing documents from the
        paths, or an empty list if something went wrong during the
        retrieval process.

    Notes
    -----
    Requires :class:`~compass.services.threaded.TempFileCachePB`
    service to be running.
    """
    COMPASS_PB.update_jurisdiction_task(
        jurisdiction.full_name, description="Loading known document(s)..."
    )

    # Work on a copy so that the caller's dictionary is not mutated when
    # the cache coroutine is injected
    loader_kwargs = dict(local_file_loader_kwargs or {})
    loader_kwargs["file_cache_coroutine"] = TempFileCachePB.call
    logger.trace(
        "kwargs for COMPASSLocalFileLoader:\n%s",
        pprint.PrettyPrinter().pformat(loader_kwargs),
    )
    fl = COMPASSLocalFileLoader(**loader_kwargs)

    async with COMPASS_PB.file_download_prog_bar(
        jurisdiction.full_name, len(fps)
    ):
        try:
            out_docs = await load_docs(fps, fl)
        except KeyboardInterrupt:
            raise
        except Exception as e:
            # Best-effort load: log the failure (with traceback) and fall
            # back to an empty result instead of propagating
            msg = (
                "Encountered error of type %r while loading known documents: "
                "%r"
            )
            err_type = type(e)
            logger.exception(msg, err_type, fps)
            out_docs = []

    return out_docs
[docs]
async def find_jurisdiction_website(
    jurisdiction,
    model_configs,
    file_loader_kwargs=None,
    search_semaphore=None,
    browser_semaphore=None,
    usage_tracker=None,
    url_ignore_substrings=None,
    validate=True,
    **kwargs,
):
    """Search for the main landing page of a given jurisdiction

    Two fixed queries are built from the jurisdiction name
    (``"<name> website"`` and ``"main website <the name>"``, case-folded
    with commas removed) and submitted through
    :func:`elm.web.search.run.search_with_fallback`, requesting three
    URLs.

    Parameters
    ----------
    jurisdiction : Jurisdiction
        Jurisdiction instance representing the jurisdiction to find the
        main webpage for.
    model_configs : dict
        Dictionary of :class:`~compass.llm.config.LLMConfig` instances.
        Must contain the ``LLMTasks.DEFAULT`` key, which is used as a
        fallback when no dedicated website-validation config is given.
        Only consulted when `validate` is ``True`` and the search
        returned at least one link.
    file_loader_kwargs : dict, optional
        Dictionary of keyword arguments pairs. These are merged into
        ``kwargs`` for the search call and also passed on to the
        website validator. By default, ``None``.
    search_semaphore : :class:`asyncio.Semaphore`, optional
        Semaphore instance that can be used to limit the number of
        playwright browsers used to submit search engine queries open
        concurrently. If ``None``, no limits are applied.
        By default, ``None``.
    browser_semaphore : :class:`asyncio.Semaphore`, optional
        Semaphore instance handed to the website validator to limit
        the number of playwright browsers open concurrently. If
        ``None``, no limits are applied. By default, ``None``.
    usage_tracker : UsageTracker, optional
        Optional tracker instance to monitor token usage during
        LLM calls. By default, ``None``.
    url_ignore_substrings : list of str, optional
        URL substrings that should be excluded from search results
        (forwarded as ``ignore_url_parts``). By default, ``None``.
    validate : bool, default=True
        If ``True``, candidate websites are checked one at a time with
        :class:`~compass.validation.location.JurisdictionWebsiteValidator`
        and the first one that passes is returned. If ``False``, a
        single candidate is taken from the search results via
        ``.pop()`` and returned without validation.
        By default, ``True``.
    **kwargs
        Additional arguments forwarded to
        :func:`elm.web.search.run.search_with_fallback`.

    Returns
    -------
    str or None
        URL for the jurisdiction website, if found; ``None`` otherwise.
    """
    kwargs.update(file_loader_kwargs or {})

    prefixed_name = jurisdiction.full_name_the_prefixed
    bare_name = prefixed_name.removeprefix("the ")
    queries = [
        f"{bare_name} website".casefold().replace(",", ""),
        f"main website {prefixed_name}".casefold().replace(",", ""),
    ]

    candidates = await search_with_fallback(
        queries=queries,
        num_urls=3,
        ignore_url_parts=url_ignore_substrings,
        browser_semaphore=search_semaphore,
        task_name=jurisdiction.full_name,
        **kwargs,
    )
    if not candidates:
        return None

    if not validate:
        return candidates.pop()

    fallback_config = model_configs[LLMTasks.DEFAULT]
    model_config = model_configs.get(
        LLMTasks.JURISDICTION_MAIN_WEBSITE_VALIDATION, fallback_config
    )
    website_validator = JurisdictionWebsiteValidator(
        browser_semaphore=browser_semaphore,
        file_loader_kwargs=file_loader_kwargs,
        usage_tracker=usage_tracker,
        llm_service=model_config.llm_service,
        **model_config.llm_call_kwargs,
    )

    # Sequential on purpose: stop at the first candidate that validates
    for candidate_url in candidates:
        is_valid = await website_validator.check(candidate_url, jurisdiction)
        if is_valid:
            return candidate_url

    return None
[docs]
async def download_jurisdiction_ordinances_from_website(
    website,
    heuristic,
    keyword_points,
    file_loader_kwargs=None,
    browser_config_kwargs=None,
    crawler_config_kwargs=None,
    max_urls=100,
    crawl_semaphore=None,
    pb_jurisdiction_name=None,
    return_c4ai_results=False,
):
    """Download ordinance documents from a jurisdiction website

    Parameters
    ----------
    website : str
        URL of the jurisdiction website to search.
    heuristic : object
        Object exposing a ``check`` method. It is called with the
        lower-cased text of each candidate document and should return
        a truthy value when the document should be kept.
    keyword_points : dict
        Dictionary of keyword points to use for scoring links.
        Keys are keywords, values are points to assign to links
        containing the keyword. If a link contains multiple keywords,
        the points are summed up.
    file_loader_kwargs : dict, optional
        Dictionary of keyword arguments pairs used to initialize the
        :class:`~compass.web.file_loader.COMPASSWebFileLoader` used by
        the crawler. ``verify_ssl`` defaults to ``False`` unless
        given here, and ``"file_cache_coroutine"`` is always set to
        ``TempFileCache.call``. If a ``"pw_launch_kwargs"`` entry is
        present, its ``"headless"`` value is also applied to the
        browser config. The input dictionary is not modified.
        By default, ``None``.
    browser_config_kwargs : dict, optional
        Dictionary of keyword arguments pairs to initialize the
        ``crawl4ai.async_configs.BrowserConfig`` class used for the
        web crawl. The ``"headless"`` entry is always overridden (see
        `file_loader_kwargs`). The input dictionary is not modified.
        By default, ``None``.
    crawler_config_kwargs : dict, optional
        Dictionary of keyword arguments pairs to initialize the
        ``crawl4ai.async_configs.CrawlerConfig`` class used for the
        web crawl. By default, ``None``.
    max_urls : int, optional
        Max number of URLs to check from the website before terminating
        the search. By default, ``100``.
    crawl_semaphore : :class:`asyncio.Semaphore`, optional
        Semaphore instance that can be used to limit the number of
        website searches happening concurrently. If ``None``, no limits
        are applied. By default, ``None``.
    pb_jurisdiction_name : str, optional
        Optional jurisdiction name to use to update progress bar, if
        it's being used. By default, ``None``.
    return_c4ai_results : bool, default=False
        If ``True``, the crawl4ai results will be returned as a second
        return value. This is useful for debugging and examining the
        crawled URLs. If ``False``, only the documents will be returned.
        By default, ``False``.

    Returns
    -------
    out_docs : list
        List of BaseDocument instances containing potential ordinance
        information, or an empty list if no ordinance document was
        found. ``source`` attributes containing spaces are sanitized
        before the documents are returned.
    results : list, optional
        List of crawl4ai results containing metadata about the crawled
        pages. Only returned when ``return_c4ai_results`` evaluates to
        ``True``.

    Notes
    -----
    Requires :class:`~compass.services.threaded.TempFileCache` service
    to be running.
    """
    if crawl_semaphore is None:
        # Empty exit stack serves as a no-op async context manager
        crawl_semaphore = AsyncExitStack()

    async def _doc_heuristic(doc):  # noqa: RUF029
        """Heuristic check for ordinance documents"""
        is_valid_document = heuristic.check(doc.text.lower())
        if is_valid_document and pb_jurisdiction_name:
            COMPASS_PB.update_website_crawl_doc_found(pb_jurisdiction_name)
        return is_valid_document

    async def _crawl_hook(*__, **___):  # noqa: RUF029
        """Update progress bar as pages are searched"""
        COMPASS_PB.update_website_crawl_task(pb_jurisdiction_name, advance=1)

    # User values override the ``verify_ssl`` default, but the cache
    # coroutine is always forced
    flk = {"verify_ssl": False}
    flk.update(file_loader_kwargs or {})
    flk.update({"file_cache_coroutine": TempFileCache.call})

    # Copy so that the caller's browser config dict is not mutated when
    # the "headless" flag is injected; tolerate an explicit ``None``
    # value for "pw_launch_kwargs"
    browser_config_kwargs = dict(browser_config_kwargs or {})
    pw_launch_kwargs = flk.get("pw_launch_kwargs") or {}
    browser_config_kwargs["headless"] = pw_launch_kwargs.get("headless", True)

    logger.trace(
        "kwargs for COMPASSWebFileLoader:\n%s",
        pprint.PrettyPrinter().pformat(flk),
    )
    afl = COMPASSWebFileLoader(**flk)
    crawler = ELMWebsiteCrawler(
        validator=_doc_heuristic,
        async_file_loader=afl,
        url_scorer=ELMLinkScorer(keyword_points).score,
        browser_config_kwargs=browser_config_kwargs,
        crawler_config_kwargs=crawler_config_kwargs,
        include_external=True,
        max_pages=max_urls,
        page_limit=int(max_urls * 3),
    )

    if pb_jurisdiction_name:
        COMPASS_PB.update_jurisdiction_task(
            pb_jurisdiction_name,
            description=f"Searching for documents from {website} ...",
        )
        cpb = COMPASS_PB.website_crawl_prog_bar(pb_jurisdiction_name, max_urls)
        ch = _crawl_hook
    else:
        # No progress bar requested: no-op context manager and no hook
        cpb = AsyncExitStack()
        ch = None

    async with crawl_semaphore, cpb:
        docs_or_pair = await crawler.run(
            website,
            on_result_hook=ch,
            return_c4ai_results=return_c4ai_results,
        )

    if return_c4ai_results:
        docs, c4ai_results = docs_or_pair
        _sanitize_doc_sources(docs)
        return docs, c4ai_results

    _sanitize_doc_sources(docs_or_pair)
    return docs_or_pair
[docs]
async def download_jurisdiction_ordinances_from_website_compass_crawl(
    website,
    heuristic,
    keyword_points,
    file_loader_kwargs=None,
    already_visited=None,
    num_link_scores_to_check_per_page=4,
    max_urls=100,
    crawl_semaphore=None,
    pb_jurisdiction_name=None,
):
    """Download ord documents from a website using the COMPASS crawler

    The COMPASS crawler is much more simplistic than the Crawl4AI
    crawler, but is designed to access some links that Crawl4AI cannot
    (such as those behind a button interface).

    Parameters
    ----------
    website : str
        URL of the jurisdiction website to search.
    heuristic : object
        Object exposing a ``check`` method. It is called with the
        lower-cased text of each candidate document and should return
        a truthy value when the document should be kept.
    keyword_points : dict
        Dictionary of keyword points to use for scoring links.
        Keys are keywords, values are points to assign to links
        containing the keyword. If a link contains multiple keywords,
        the points are summed up.
    file_loader_kwargs : dict, optional
        Dictionary of keyword arguments pairs forwarded to
        :class:`~compass.web.website_crawl.COMPASSCrawler` as its
        ``file_loader_kwargs``. ``"file_cache_coroutine"`` is always
        set to ``TempFileCache.call``. The input dictionary is not
        modified. By default, ``None``.
    already_visited : set of str, optional
        URLs that have already been crawled and should be skipped.
        By default, ``None``.
    num_link_scores_to_check_per_page : int, default=4
        Number of top-scoring links to visit per page.
        By default, ``4``.
    max_urls : int, default=100
        Max number of URLs to check from the website before terminating
        the search. By default, ``100``.
    crawl_semaphore : :class:`asyncio.Semaphore`, optional
        Semaphore instance that can be used to limit the number of
        website crawls happening concurrently. If ``None``, no limits
        are applied. By default, ``None``.
    pb_jurisdiction_name : str, optional
        Optional jurisdiction name to use to update progress bar, if
        it's being used. By default, ``None``.

    Returns
    -------
    out_docs : list
        List of BaseDocument instances containing potential ordinance
        information, or an empty list if no ordinance document was
        found.

    Notes
    -----
    Requires :class:`~compass.services.threaded.TempFileCache` service
    to be running.
    """
    if crawl_semaphore is None:
        # Empty exit stack serves as a no-op async context manager
        crawl_semaphore = AsyncExitStack()

    async def _doc_heuristic(doc):  # noqa: RUF029
        """Heuristic check for ordinance documents"""
        is_valid_document = heuristic.check(doc.text.lower())
        if is_valid_document and pb_jurisdiction_name:
            COMPASS_PB.update_compass_website_crawl_doc_found(
                pb_jurisdiction_name
            )
        return is_valid_document

    async def _crawl_hook(*__, **___):  # noqa: RUF029
        """Update progress bar as pages are searched"""
        COMPASS_PB.update_compass_website_crawl_task(
            pb_jurisdiction_name, advance=1
        )

    # Work on a copy so that the caller's dictionary is not mutated when
    # the cache coroutine is injected
    loader_kwargs = dict(file_loader_kwargs or {})
    loader_kwargs["file_cache_coroutine"] = TempFileCache.call

    crawler = COMPASSCrawler(
        validator=_doc_heuristic,
        url_scorer=COMPASSLinkScorer(keyword_points).score,
        file_loader_kwargs=loader_kwargs,
        num_link_scores_to_check_per_page=num_link_scores_to_check_per_page,
        already_visited=already_visited,
        max_pages=max_urls,
    )

    if pb_jurisdiction_name:
        COMPASS_PB.update_jurisdiction_task(
            pb_jurisdiction_name,
            description=f"Double-checking {website} for documents ...",
        )
        cpb = COMPASS_PB.compass_website_crawl_prog_bar(
            pb_jurisdiction_name, max_urls
        )
        ch = _crawl_hook
    else:
        # No progress bar requested: no-op context manager and no hook
        cpb = AsyncExitStack()
        ch = None

    async with crawl_semaphore, cpb:
        return await crawler.run(website, on_new_page_visit_hook=ch)
[docs]
async def download_jurisdiction_ordinance_using_search_engine(
    query_templates,
    jurisdiction,
    num_urls=5,
    simple_se_result_sort=True,
    file_loader_kwargs=None,
    search_semaphore=None,
    browser_semaphore=None,
    url_ignore_substrings=None,
    **kwargs,
):
    """Download the ordinance document(s) for a single jurisdiction

    Parameters
    ----------
    query_templates : sequence of str
        Query templates that will be formatted with the jurisdiction
        name before submission to the search engine.
    jurisdiction : Jurisdiction
        Location object representing the jurisdiction.
    num_urls : int, optional
        Number of unique search result URL's to check for ordinance
        documents. By default, ``5``.
    simple_se_result_sort : bool, optional
        Flag indicating whether to use a simple top-n sort from the
        first search engine that gives results (``True``) or to apply a
        holistic link sorting based on all results from all search
        engines (``False``). By default, ``True``.
    file_loader_kwargs : dict, optional
        Dictionary of keyword-argument pairs. These are merged into
        ``kwargs`` (overriding entries with the same name) and thereby
        reach both the search call and the web file loader.
        By default, ``None``.
    search_semaphore : :class:`asyncio.Semaphore`, optional
        Semaphore instance that can be used to limit the number of
        playwright browsers used to submit search engine queries open
        concurrently. By default, ``None``.
    browser_semaphore : :class:`asyncio.Semaphore`, optional
        Semaphore instance that can be used to limit the number of
        playwright browsers used to download content from the web open
        concurrently. If ``None``, no limits are applied.
        By default, ``None``.
    url_ignore_substrings : list of str, optional
        URL substrings that should be excluded from search results
        (forwarded as ``ignore_url_parts``). By default, ``None``.
    **kwargs
        Additional keyword arguments forwarded to the web search and
        file loading steps. A ``"file_cache_coroutine"`` entry is
        always overridden with ``TempFileCachePB.call``.

    Returns
    -------
    list
        Documents loaded from the search results. An empty list is
        returned when the search produced no usable URLs or when an
        error was raised (the error is logged, not propagated).

    Notes
    -----
    Requires :class:`~compass.services.threaded.TempFileCachePB`
    service to be running.
    """
    COMPASS_PB.update_jurisdiction_task(
        jurisdiction.full_name, description="Searching web..."
    )

    kwargs.update(file_loader_kwargs or {})
    kwargs["file_cache_coroutine"] = TempFileCachePB.call

    try:
        return await _docs_from_web_search(
            query_templates,
            num_urls=num_urls,
            search_semaphore=search_semaphore,
            browser_semaphore=browser_semaphore,
            ignore_url_parts=url_ignore_substrings,
            jurisdiction=jurisdiction,
            simple_se_result_sort=simple_se_result_sort,
            **kwargs,
        )
    except KeyboardInterrupt:
        raise
    except Exception as e:
        # Best-effort search: record the traceback and report no docs
        logger.exception(
            "Encountered error of type %r while searching web for docs for %s:",
            type(e),
            jurisdiction.full_name,
        )
        return []
[docs]
async def filter_ordinance_docs(
    docs,
    jurisdiction,
    model_configs,
    heuristic,
    tech,
    text_collectors,
    usage_tracker=None,
):
    """Filter a list of documents to only those that contain ordinances

    Documents first go through a jurisdiction check, then through a
    content check (which also attempts date extraction on documents
    that pass), and the survivors are sorted from most to least
    desirable.

    Parameters
    ----------
    docs : sequence of BaseDocument
        Documents to screen for ordinance content.
    jurisdiction : Jurisdiction
        Location object representing the jurisdiction.
    model_configs : dict
        Dictionary of LLMConfig instances. Must contain the
        ``LLMTasks.DEFAULT`` key, which is used as a fallback for all
        tasks.
    heuristic : object
        Domain-specific heuristic implementing a ``check`` method to
        qualify ordinance content.
    tech : str
        Technology of interest (e.g. "solar", "wind", etc). This is
        used to set up some document validation decision trees.
    text_collectors : iterable
        Iterable of text collector classes to run during document
        parsing. Each class must implement the
        :class:`compass.plugin.interface.BaseTextCollector` interface.
        If the document already contains text collected by a given
        collector (i.e. the collector's ``OUT_LABEL`` is found in
        ``doc.attrs``), that collector will be skipped.
    usage_tracker : UsageTracker, optional
        Optional tracker instance to monitor token usage during
        LLM calls. By default, ``None``.

    Returns
    -------
    list
        Sorted list of BaseDocument instances possibly containing
        ordinance information. If nothing survives the filters, the
        (empty) output of the final filter step is returned as-is.

    Notes
    -----
    The function updates CLI progress bars to reflect each filtering
    phase and returns documents sorted by quality heuristics.
    """
    name = jurisdiction.full_name

    incoming_sources = "\n\t- ".join(
        doc.attrs.get("source", "Unknown source") for doc in docs
    )
    logger.info(
        "%d document(s) passed in to COMPASS filter for %s\n\t- %s",
        len(docs),
        name,
        incoming_sources,
    )

    # Phase 1: keep only documents that belong to this jurisdiction
    COMPASS_PB.update_jurisdiction_task(
        name, description="Checking files for correct jurisdiction..."
    )
    jurisdiction_model_config = model_configs.get(
        LLMTasks.DOCUMENT_JURISDICTION_VALIDATION,
        model_configs[LLMTasks.DEFAULT],
    )
    docs = await _down_select_docs_correct_jurisdiction(
        docs,
        jurisdiction=jurisdiction,
        usage_tracker=usage_tracker,
        model_config=jurisdiction_model_config,
    )
    remaining_sources = "\n\t- ".join(
        doc.attrs.get("source", "Unknown source") for doc in docs
    )
    logger.info(
        "%d document(s) remaining after jurisdiction filter for %s\n\t- %s",
        len(docs),
        name,
        remaining_sources,
    )

    # Phase 2: keep only documents that contain relevant legal text
    COMPASS_PB.update_jurisdiction_task(
        name, description="Checking files for legal text..."
    )
    docs = await filter_documents(
        docs,
        validation_coroutine=_contains_relevant_text,
        task_name=name,
        model_configs=model_configs,
        heuristic=heuristic,
        tech=tech,
        text_collectors=text_collectors,
        usage_tracker=usage_tracker,
    )

    if docs:
        docs = _sort_final_ord_docs(docs)
        logger.info(
            "Found %d potential ordinance document(s) for %s\n\t- %s",
            len(docs),
            name,
            "\n\t- ".join(map(str, docs)),
        )
    else:
        logger.info(
            "Did not find any potential ordinance documents for %s", name
        )

    return docs
async def _docs_from_web_search(
    query_templates,
    num_urls,
    search_semaphore,
    browser_semaphore,
    ignore_url_parts,
    jurisdiction,
    simple_se_result_sort,
    **kwargs,
):
    """Retrieve top ``N`` search results as document instances

    Search results that were filtered out (non-``None``
    ``"filtered_reason"``) or that carry no URL are ignored. The
    remaining URLs are downloaded in ascending ``"overall_rank"`` order
    (a missing or falsy rank counts as ``1``), and each resulting
    document is tagged with the rank looked up by its ``"source"``
    attribute (``None`` if the source is not one of the searched URLs).
    """
    search_output = await search_single_jurisdiction(
        query_templates,
        jurisdiction,
        num_urls,
        search_semaphore,
        ignore_url_parts,
        simple=simple_se_result_sort,
        **kwargs,
    )

    rank_by_url = {}
    for result in search_output["results"]:
        url = result.get("url")
        if url is None or result.get("filtered_reason") is not None:
            continue
        rank_by_url[url] = result.get("overall_rank") or 1

    if not rank_by_url:
        return []

    ordered_urls = sorted(rank_by_url, key=rank_by_url.get)
    docs = await _docs_from_urls(
        ordered_urls, jurisdiction.full_name, browser_semaphore, **kwargs
    )

    for doc in docs:
        source = doc.attrs.get("source")
        doc.attrs[_COLLECTION_SCORE_KEY] = rank_by_url.get(source)

    return docs
async def _docs_from_urls(
    urls, jurisdiction_full_name, browser_semaphore, **kwargs
):
    """Load documents from a list of URLs using COMPASSWebFileLoader

    ``kwargs`` are passed straight to the file loader constructor. The
    jurisdiction name is only used to label progress bar updates.
    """
    url_listing = "\n\t-".join(urls)
    logger.debug("Downloading documents for URLS: \n\t-%s", url_listing)

    formatted_kwargs = pprint.PrettyPrinter().pformat(kwargs)
    logger.trace("kwargs for COMPASSWebFileLoader:\n%s", formatted_kwargs)

    web_loader = COMPASSWebFileLoader(
        browser_semaphore=browser_semaphore, **kwargs
    )

    COMPASS_PB.update_jurisdiction_task(
        jurisdiction_full_name, description="Downloading files..."
    )
    download_bar = COMPASS_PB.file_download_prog_bar(
        jurisdiction_full_name, len(urls)
    )
    async with download_bar:
        return await load_docs(urls, web_loader)
async def _down_select_docs_correct_jurisdiction(
    docs, jurisdiction, usage_tracker, model_config
):
    """Remove documents that do not match the target jurisdiction

    Documents whose ``"check_correct_jurisdiction"`` attribute is set
    to a falsy value skip validation and are always kept; they come
    first in the output, followed by the documents that passed the
    jurisdiction check.
    """
    needs_check, exempt = [], []
    for doc in docs:
        # A missing flag means the document must be validated
        must_validate = doc.attrs.get("check_correct_jurisdiction", True)
        bucket = needs_check if must_validate else exempt
        bucket.append(doc)

    if not needs_check:
        return exempt

    validator = JurisdictionValidator(
        text_splitter=model_config.text_splitter,
        llm_service=model_config.llm_service,
        usage_tracker=usage_tracker,
        **model_config.llm_call_kwargs,
    )
    logger.debug("Validating documents for %r", jurisdiction)
    validated = await filter_documents(
        needs_check,
        validation_coroutine=validator.check,
        jurisdiction=jurisdiction,
        task_name=jurisdiction.full_name,
    )
    return exempt + validated
async def _contains_relevant_text(
    doc, model_configs, usage_tracker=None, **kwargs
):
    """Determine whether a document contains ordinance information

    When relevant text is found, a date extraction pass is also run on
    the document. The return value is whatever
    ``check_for_relevant_text`` reported.
    """
    default_config = model_configs[LLMTasks.DEFAULT]
    content_config = model_configs.get(
        LLMTasks.DOCUMENT_CONTENT_VALIDATION, default_config
    )

    logger.debug(
        "Checking doc for ordinance info (source: %r)...",
        doc.attrs.get("source", "unknown"),
    )
    found_text = await check_for_relevant_text(
        doc,
        model_config=content_config,
        usage_tracker=usage_tracker,
        **kwargs,
    )
    if not found_text:
        return found_text

    logger.debug("Detected relevant text; parsing date...")
    date_config = model_configs.get(
        LLMTasks.DATE_EXTRACTION, model_configs[LLMTasks.DEFAULT]
    )
    # The returned document is not used here.
    # NOTE(review): presumably ``extract_date`` records the date on ``doc``
    # in place -- confirm.
    await extract_date(doc, date_config, usage_tracker=usage_tracker)
    return found_text
def _sanitize_doc_sources(docs):
    """Rewrite source attrs on documents returned by ELMWebsiteCrawler

    crawl4ai can surface PDF URLs containing raw spaces (e.g. filenames
    like "Land Use Code.pdf"), which are not valid in a URL path. Every
    document whose ``source`` attribute contains a space has that
    attribute replaced, in-place, by the output of ``sanitize_url``.
    Documents with a missing/empty source or a source without spaces
    are left untouched.
    """
    for doc in docs:
        raw_source = doc.attrs.get("source")
        if not raw_source or " " not in raw_source:
            continue
        doc.attrs["source"] = sanitize_url(raw_source)
def _sort_final_ord_docs(all_ord_docs):
    """Sort ordinance documents by desirability heuristics

    Returns a new list ordered from most to least desirable according
    to ``_ord_doc_sorting_key``, or ``None`` when the input is empty
    (or otherwise falsy).
    """
    if all_ord_docs:
        return sorted(all_ord_docs, key=_ord_doc_sorting_key, reverse=True)
    return None
def _ord_doc_sorting_key(doc):
    """Compute a composite sorting score for ordinance documents

    Documents with larger scores will be prioritized. The key is a
    tuple compared element by element, so earlier entries dominate
    later ones.
    """
    attrs = doc.attrs
    steps = attrs.get("from_steps") or []

    # Rank is negated so that a smaller rank gives a larger key
    negated_rank = -(attrs.get(_COLLECTION_SCORE_KEY) or 0)

    # Missing date (or missing date components) sort as -infinity
    year, month, day = attrs.get("date") or (_NEG_INF, _NEG_INF, _NEG_INF)

    # If not present, URL check passed with confidence so we set
    # score to 1
    jurisdiction_score = attrs.get(DTreeJurisdictionValidator.META_SCORE_KEY, 1)

    return (
        len(steps),  # found by more collection steps
        _best_step(steps),  # highest-priority collection step
        negated_rank,  # best search result rank
        attrs.get(_SCORE_KEY, 0),  # website crawl link score
        year or _NEG_INF,  # most recent year
        is_pdf_doc(doc),  # PDFs preferred
        jurisdiction_score,  # jurisdiction validation score
        -1 * len(doc.text),  # shorter text preferred
        month or _NEG_INF,  # most recent month
        day or _NEG_INF,  # most recent day
    )
def _best_step(from_steps):
    """Get the best step that led to finding a document

    Returns the largest ``priority`` among the given collection steps,
    or ``0`` when no steps are given.
    """
    if from_steps:
        priorities = [
            COMPASSDocumentCollectionStep(step).priority
            for step in from_steps
        ]
        return max(priorities)
    return 0