Close Menu
    Facebook X (Twitter) Instagram
    • Privacy Policy
    • Terms Of Service
    • Social Media Disclaimer
    • DMCA Compliance
    • Anti-Spam Policy
    Facebook X (Twitter) Instagram
    Crypto Love You
    • Home
    • Crypto News
      • Bitcoin
      • Ethereum
      • Altcoins
      • Blockchain
      • DeFi
    • AI News
    • Stock News
    • Learn
      • AI for Beginners
      • AI Tips
      • Make Money with AI
    • Reviews
    • Tools
      • Best AI Tools
      • Crypto Market Cap List
      • Stock Market Overview
      • Market Heatmap
    • Contact
    Crypto Love You
    Home»AI News»How a Haystack-Powered Multi-Agent System Detects Incidents, Investigates Metrics and Logs, and Produces Production-Grade Incident Reviews End-to-End
    How a Haystack-Powered Multi-Agent System Detects Incidents, Investigates Metrics and Logs, and Produces Production-Grade Incident Reviews End-to-End
    AI News

    How a Haystack-Powered Multi-Agent System Detects Incidents, Investigates Metrics and Logs, and Produces Production-Grade Incident Reviews End-to-End

    January 27, 2026 · 4 Mins Read
    Share
    Facebook Twitter LinkedIn Pinterest Email
    binance


    @tool
    def sql_investigate(query: str) -> dict:
    try:
    df = con.execute(query).df()
    head = df.head(30)
    return {
    “rows”: int(len(df)),
    “columns”: list(df.columns),
    “preview”: head.to_dict(orient=”records”)
    }
    except Exception as e:
    return {“error”: str(e)}

    @tool
    def log_pattern_scan(window_start_iso: str, window_end_iso: str, top_k: int = 8) -> dict:
    ws = pd.to_datetime(window_start_iso)
    we = pd.to_datetime(window_end_iso)
    df = logs_df[(logs_df[“ts”] >= ws) & (logs_df[“ts”] <= we)].copy()
    if df.empty:
    return {“rows”: 0, “top_error_kinds”: [], “top_services”: [], “top_endpoints”: []}
    df[“error_kind_norm”] = df[“error_kind”].fillna(“”).replace(“”, “NONE”)
    err = df[df[“level”].isin([“WARN”,”ERROR”])].copy()
    top_err = err[“error_kind_norm”].value_counts().head(int(top_k)).to_dict()
    top_svc = err[“service”].value_counts().head(int(top_k)).to_dict()
    top_ep = err[“endpoint”].value_counts().head(int(top_k)).to_dict()
    by_region = err.groupby(“region”).size().sort_values(ascending=False).head(int(top_k)).to_dict()
    p95_latency = float(np.percentile(df[“latency_ms”].values, 95))
    return {
    “rows”: int(len(df)),
    “warn_error_rows”: int(len(err)),
    “p95_latency_ms”: p95_latency,
    “top_error_kinds”: top_err,
    “top_services”: top_svc,
    “top_endpoints”: top_ep,
    “error_by_region”: by_region
    }

    @tool
    def propose_mitigations(hypothesis: str) -> dict:
    h = hypothesis.lower()
    mitigations = []
    if “conn” in h or “pool” in h or “db” in h:
    mitigations += [
    {“action”: “Increase DB connection pool size (bounded) and add backpressure at db-proxy”, “owner”: “Platform”, “eta_days”: 3},
    {“action”: “Add circuit breaker + adaptive timeouts between api-gateway and db-proxy”, “owner”: “Backend”, “eta_days”: 5},
    {“action”: “Tune query hotspots; add indexes for top offending endpoints”, “owner”: “Data/DBA”, “eta_days”: 7},
    ]
    if “timeout” in h or “upstream” in h:
    mitigations += [
    {“action”: “Implement hedged requests for idempotent calls (carefully) and tighten retry budgets”, “owner”: “Backend”, “eta_days”: 6},
    {“action”: “Add upstream SLO-aware load shedding at api-gateway”, “owner”: “Platform”, “eta_days”: 7},
    ]
    if “cache” in h:
    mitigations += [
    {“action”: “Add request coalescing and negative caching to prevent cache-miss storms”, “owner”: “Backend”, “eta_days”: 6},
    {“action”: “Prewarm cache for top endpoints during deploys”, “owner”: “SRE”, “eta_days”: 4},
    ]
    if not mitigations:
    mitigations += [
    {“action”: “Add targeted dashboards and alerts for the suspected bottleneck metric”, “owner”: “SRE”, “eta_days”: 3},
    {“action”: “Run controlled load test to reproduce and validate the hypothesis”, “owner”: “Perf Eng”, “eta_days”: 5},
    ]
    mitigations = mitigations[:10]
    return {“hypothesis”: hypothesis, “mitigations”: mitigations}

    @tool
    def draft_postmortem(title: str, window_start_iso: str, window_end_iso: str, customer_impact: str, suspected_root_cause: str, key_facts_json: str, mitigations_json: str) -> dict:
    try:
    facts = json.loads(key_facts_json)
    except Exception:
    facts = {“note”: “key_facts_json was not valid JSON”}
    try:
    mits = json.loads(mitigations_json)
    except Exception:
    mits = {“note”: “mitigations_json was not valid JSON”}
    doc = {
    “title”: title,
    “date_utc”: datetime.utcnow().strftime(“%Y-%m-%d”),
    “incident_window_utc”: {“start”: window_start_iso, “end”: window_end_iso},
    “customer_impact”: customer_impact,
    “suspected_root_cause”: suspected_root_cause,
    “detection”: {
    “how_detected”: “Automated anomaly detection + error-rate spike triage”,
    “gaps”: [“Add earlier saturation alerting”, “Improve symptom-to-cause correlation dashboards”]
    },
    “timeline”: [
    {“t”: window_start_iso, “event”: “Symptoms begin (latency/error anomalies)”},
    {“t”: “T+10m”, “event”: “On-call begins triage; identifies top services/endpoints”},
    {“t”: “T+25m”, “event”: “Mitigation actions initiated (throttling/backpressure)”},
    {“t”: window_end_iso, “event”: “Customer impact ends; metrics stabilize”},
    ],
    “key_facts”: facts,
    “corrective_actions”: mits.get(“mitigations”, mits),
    “followups”: [
    {“area”: “Reliability”, “task”: “Add saturation signals + budget-based retries”, “priority”: “P1”},
    {“area”: “Observability”, “task”: “Add golden signals per service/endpoint”, “priority”: “P1”},
    {“area”: “Performance”, “task”: “Reproduce with load test and validate fix”, “priority”: “P2”},
    ],
    “appendix”: {“notes”: “Generated by a Haystack multi-agent workflow (non-RAG).”}
    }
    return {“postmortem_json”: doc}

    ledger

    llm = OpenAIChatGenerator(model=”gpt-4o-mini”)

    state_schema = {
    “metrics_csv_path”: {“type”: str},
    “logs_csv_path”: {“type”: str},
    “metrics_summary”: {“type”: dict},
    “logs_summary”: {“type”: dict},
    “incident_window”: {“type”: dict},
    “investigation_notes”: {“type”: list, “handler”: merge_lists},
    “hypothesis”: {“type”: str},
    “key_facts”: {“type”: dict},
    “mitigation_plan”: {“type”: dict},
    “postmortem”: {“type”: dict},
    }

    profiler_prompt = “””You are a specialist incident profiler.
    Goal: turn raw metrics/log summaries into crisp, high-signal findings.
    Rules:
    – Prefer calling tools over guessing.
    – Output must be a JSON object with keys: window, symptoms, top_contributors, hypothesis, key_facts.
    – Hypothesis must be falsifiable and mention at least one specific service and mechanism.
    “””

    writer_prompt = “””You are a specialist postmortem writer.
    Goal: produce a high-quality postmortem JSON (not prose) using the provided evidence and mitigation plan.
    Rules:
    – Call tools only if needed.
    – Keep ‘suspected_root_cause’ specific and not generic.
    – Ensure corrective actions have owners and eta_days.
    “””

    coordinator_prompt = “””You are an incident commander coordinating a non-RAG multi-agent workflow.
    You must:
    1) Load inputs
    2) Find an incident window (use p95_ms or error_rate)
    3) Investigate with targeted SQL and log pattern scan
    4) Ask the specialist profiler to synthesize evidence
    5) Propose mitigations
    6) Ask the specialist writer to draft a postmortem JSON
    Return a final response with:
    – A short executive summary (max 10 lines)
    – The postmortem JSON
    – A compact runbook checklist (bulleted)
    “””

    profiler_agent = Agent(
    chat_generator=llm,
    tools=[load_inputs, detect_incident_window, sql_investigate, log_pattern_scan],
    system_prompt=profiler_prompt,
    exit_conditions=[“text”],
    state_schema=state_schema
    )

    writer_agent = Agent(
    chat_generator=llm,
    tools=[draft_postmortem],
    system_prompt=writer_prompt,
    exit_conditions=[“text”],
    state_schema=state_schema
    )

    profiler_tool = ComponentTool(
    component=profiler_agent,
    name=”profiler_specialist”,
    description=”Synthesizes incident evidence into a falsifiable hypothesis and key facts (JSON output).”,
    outputs_to_string={“source”: “last_message”}
    )

    writer_tool = ComponentTool(
    component=writer_agent,
    name=”postmortem_writer_specialist”,
    description=”Drafts a postmortem JSON using title/window/impact/rca/facts/mitigations.”,
    outputs_to_string={“source”: “last_message”}
    )

    coordinator_agent = Agent(
    chat_generator=llm,
    tools=[
    load_inputs,
    detect_incident_window,
    sql_investigate,
    log_pattern_scan,
    propose_mitigations,
    profiler_tool,
    writer_tool,
    draft_postmortem
    ],
    system_prompt=coordinator_prompt,
    exit_conditions=[“text”],
    state_schema=state_schema
    )



    Source link

    aistudios
    Share. Facebook Twitter Pinterest LinkedIn Tumblr Email
    CryptoExpert
    • Website

    Related Posts

    How multi-agent AI economics influence business automation

    March 13, 2026

    How to Design a Streaming Decision Agent with Partial Reasoning, Online Replanning, and Reactive Mid-Execution Adaptation in Dynamic Environments

    March 12, 2026

    AI Is Learning From the News. Now Publishers Want to Get Paid

    March 11, 2026

    Improving AI models’ ability to explain their predictions | MIT News

    March 10, 2026
    Add A Comment
    Leave A Reply Cancel Reply

    notion
    Latest Posts

    100% Free AI Course by Anthropic – Learn AI in 2026

    March 13, 2026

    ChatGPT vs Gemini: Make Roblox Hacks (IT ACTUALLY WORKS!)

    March 13, 2026

    Ripple to Buy Back $750M in Shares through April: Report

    March 13, 2026

    EigenCloud Challenge Reveals 5 AI Agents Using TEEs for Verifiable Trust

    March 13, 2026

    Vitalik Buterin Redefines Ethereum With Three Core Roles

    March 13, 2026
    synthesia
    LEGAL INFORMATION
    • Privacy Policy
    • Terms Of Service
    • Social Media Disclaimer
    • DMCA Compliance
    • Anti-Spam Policy
    Top Insights

    Bitcoin Following The 2022 Cycle? What To Expect If It Plays Out The Same Way

    March 13, 2026

    Why Every Blockchain Suddenly Wants Its Own Perp Dex

    March 13, 2026
    synthesia
    Facebook X (Twitter) Instagram Pinterest
    © 2026 CryptoLoveYou.com - All rights reserved.

    Type above and press Enter to search. Press Esc to cancel.