# Source code for compass.scripts.search

"""Search-only orchestration for COMPASS

Runs the web-search portion of the COMPASS pipeline (no download,
filtering, validation, or extraction) and emits a JSON report of the
ranked URLs returned by each configured search engine for each
jurisdiction. The output is intended to help diagnose retrieval
quality before invoking the full pipeline.
"""

import asyncio
import json
import logging
from datetime import datetime, UTC
from pathlib import Path

from compass.web.search import search_single_jurisdiction
from compass.pipeline.runtime import PipelineRuntime
from compass.utilities.jurisdictions import (
    jurisdictions_from_df,
    load_jurisdictions_from_fp,
)


logger = logging.getLogger(__name__)






def write_search_report(report, out_path):
    """Serialize a search-only report to a JSON file

    Parameters
    ----------
    report : dict
        Report returned by :func:`run_search`.
    out_path : path-like
        Destination file path. Any missing parent directories are
        created before the file is written.
    """
    # Serialize first: a report that cannot be encoded raises here,
    # before any directory or file has been created.
    text = json.dumps(report, indent=2, ensure_ascii=False)

    destination = Path(out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as handle:
        handle.write(text)
def _rank_or_inf(value):
    """Return ``value``, or positive infinity when it is ``None``

    Used as a sort-key component so that entries without a rank are
    ordered after every ranked entry.
    """
    return float("inf") if value is None else value


def summary(report):
    """Format search-only output as readable plain text

    Parameters
    ----------
    report : dict
        Dictionary produced by :func:`run_search`. The
        ``"jurisdictions"`` entry, and the ``"results"`` entry of each
        jurisdiction, may be missing or ``None``; both cases are
        treated as an empty list.

    Returns
    -------
    str
        Multi-line summary containing only records that were not
        filtered (``filtered_reason`` is ``None``), sorted by
        ``overall_rank`` and then ``query_rank`` within each
        jurisdiction. Entries without a rank are listed last.
        Jurisdictions with a truthy ``"error"`` value show only the
        error. Trailing whitespace is stripped from the result.
    """
    lines = [
        "COMPASS search-only summary",
        f"tech: {report.get('tech')}",
        f"timestamp: {report.get('timestamp')}",
        f"requested top urls: {report.get('num_urls_requested')}",
        "",
    ]

    # ``or []`` (rather than a ``.get`` default) also covers a key that
    # is present but null, e.g. in a report reloaded from JSON.
    for jur in report.get("jurisdictions") or []:
        lines.append(f"jurisdiction: {jur.get('jurisdiction')}")

        error = jur.get("error")
        if error:
            lines.extend((f" error: {error}", ""))
            continue

        kept = sorted(
            (
                entry
                for entry in jur.get("results") or []
                if entry.get("filtered_reason") is None
            ),
            key=lambda entry: (
                _rank_or_inf(entry.get("overall_rank")),
                _rank_or_inf(entry.get("query_rank")),
            ),
        )
        if not kept:
            lines.extend((" no unfiltered results", ""))
            continue

        for entry in kept:
            header = (
                " "
                f"[{entry.get('overall_rank')}] "
                f"{entry.get('search_engine')} "
                f"(query_rank={entry.get('query_rank')})"
            )
            lines.extend(
                (
                    header,
                    f" query: {entry.get('query')}",
                    f" url: {entry.get('url')}",
                )
            )
        # Blank separator line between jurisdictions.
        lines.append("")

    return "\n".join(lines).rstrip()