# aza/AzA march 2026/services/news_llm_search.py

"""Live medical news search using OpenAI web search (like ChatGPT)."""
from __future__ import annotations

import json
import os
import re
from dataclasses import dataclass, asdict
from datetime import date
from typing import Any

from openai import OpenAI

# Maps specialty slugs (the identifiers passed in as `specialties`) to the
# German display labels that are interpolated into the search prompt.
# `_build_news_prompt` looks slugs up with `.get(slug, slug)`, so a slug that
# is missing here is passed through to the prompt unchanged.
SPECIALTY_LABELS = {
    "dermatology": "Dermatologie",
    "general-medicine": "Allgemeinmedizin",
    "internal-medicine": "Innere Medizin",
    "gynecology": "Gynäkologie",
    "anesthesiology": "Anästhesiologie",
    "cardiology": "Kardiologie",
    "oncology": "Onkologie",
    "pediatrics": "Pädiatrie",
    "neurology": "Neurologie",
    "psychiatry": "Psychiatrie",
    "surgery": "Chirurgie",
    "ophthalmology": "Ophthalmologie",
    "ent": "HNO",
    "urology": "Urologie",
    "orthopedics": "Orthopädie",
    "radiology": "Radiologie",
    "rheumatology": "Rheumatologie",
    "endocrinology": "Endokrinologie",
    "gastroenterology": "Gastroenterologie",
    "pulmonology": "Pneumologie",
    "nephrology": "Nephrologie",
    "infectiology": "Infektiologie",
    "emergency-medicine": "Notfallmedizin",
    "pathology": "Pathologie",
    "allergology": "Allergologie",
}


@dataclass(frozen=True)
class NewsCandidate:
    """One news item taken from the search model's JSON reply.

    Instances are created by `_normalize_news`, which coerces every field to
    a stripped string. `frozen=True` blocks attribute reassignment; the
    `tags` list object itself is still mutable.
    """

    # Headline; `_normalize_news` rejects rows whose title is empty.
    title: str
    # Link to the article as reported by the model; may be empty, not validated.
    url: str
    # Name of the publishing outlet; may be empty.
    source: str
    # Publication date as a string. The prompt asks for YYYY-MM-DD, but the
    # value is stored as received without parsing.
    # NOTE(review): camelCase presumably mirrors the JSON key — confirm with consumers.
    publishedAt: str
    # Short abstract; `_normalize_news` truncates it to 800 characters.
    summary: str
    # Lower-cased tag strings (the prompt asks for specialty slugs).
    tags: list[str]
    # Lower-cased language value; `_normalize_news` substitutes "en" when absent.
    language: str

def _extract_json_block(text: str) -> dict:
    cleaned = text.strip()
    cleaned = re.sub(r"^```[a-zA-Z]*\s*", "", cleaned)
    cleaned = re.sub(r"\s*```\s*$", "", cleaned)
    cleaned = cleaned.strip()
    try:
        data = json.loads(cleaned)
        if isinstance(data, dict):
            return data
    except Exception:
        pass
    match = re.search(r"\{[\s\S]*\}", cleaned)
    if match:
        try:
            data = json.loads(match.group(0))
            if isinstance(data, dict):
                return data
        except Exception:
            pass
    return {"news": []}


def _build_news_prompt(
    specialties: list[str],
    limit: int,
) -> str:
    spec_labels = [SPECIALTY_LABELS.get(s, s) for s in specialties]
    spec_text = ", ".join(spec_labels) if spec_labels else "Medizin allgemein"
    today_str = date.today().isoformat()

    return (
        f"Suche im Internet nach den NEUESTEN und WICHTIGSTEN medizinischen News "
        f"und Forschungsergebnissen. Heutiges Datum: {today_str}\n\n"
        f"Fachgebiete: {spec_text}\n\n"
        "Ich brauche aktuelle, relevante Nachrichten aus der Medizin:\n"
        "- Neue Studien und Forschungsergebnisse\n"
        "- Neue Therapien und Medikamente (Zulassungen, Phase-III-Ergebnisse)\n"
        "- Leitlinien-Updates\n"
        "- Wichtige Konferenz-Highlights und Abstracts\n"
        "- Gesundheitspolitische Nachrichten\n"
        "- Sicherheitswarnungen (FDA, EMA, Swissmedic)\n\n"
        "Bevorzuge Quellen wie: NEJM, Lancet, JAMA, BMJ, Nature Medicine, "
        "Deutsches Ärzteblatt, Swiss Medical Weekly, Medical Tribune, "
        "PubMed, Medscape, aerzteblatt.de\n\n"
        f"Liefere mindestens 10, maximal {limit} Ergebnisse.\n\n"
        "WICHTIG: Antwort als REINES JSON, kein anderer Text.\n"
        '{"news": [...]}\n'
        "Felder pro News-Item:\n"
        "title (Titel der Nachricht),\n"
        "url (DIREKTER Link zum Artikel),\n"
        "source (Name der Quelle, z.B. 'NEJM', 'Lancet'),\n"
        "publishedAt (YYYY-MM-DD, Veröffentlichungsdatum),\n"
        "summary (2-4 Sätze Zusammenfassung),\n"
        'tags (Array der Fachgebiete, z.B. ["dermatology", "oncology"]),\n'
        "language (Sprache des Originalartikels, z.B. 'en', 'de')"
    )


def _normalize_news(row: dict, default_tags: list[str]) -> NewsCandidate | None:
    """Convert one raw item from the model's JSON into a ``NewsCandidate``.

    Every field is coerced to a stripped string; missing or ``None`` values
    become empty strings.

    Args:
        row: One element of the ``"news"`` array. Anything that is not a
            dict is rejected.
        default_tags: Tags to use when the item carries no usable tags of
            its own (missing, not a list, or only blank/``None`` entries).

    Returns:
        The normalized candidate, or ``None`` when ``row`` is not a dict or
        has no title.
    """
    if not isinstance(row, dict):
        return None

    title = str(row.get("title") or "").strip()
    if not title:
        return None

    model_tags = row.get("tags")
    if not isinstance(model_tags, list):
        model_tags = []

    # Prefer the item's own tags; fall back to the defaults when cleaning
    # leaves nothing. ``None`` entries are skipped so they do not turn into
    # the literal tag "none".
    tags: list[str] = []
    for tag_source in (model_tags, default_tags or []):
        tags = [
            text
            for text in (str(t).strip().lower() for t in tag_source if t is not None)
            if text
        ]
        if tags:
            break

    # Apply the "en" default after stripping so a whitespace-only value
    # does not end up as an empty language.
    language = str(row.get("language") or "").strip().lower() or "en"

    return NewsCandidate(
        title=title,
        url=str(row.get("url") or "").strip(),
        source=str(row.get("source") or "").strip(),
        publishedAt=str(row.get("publishedAt") or "").strip(),
        # Some replies use "description" instead of "summary"; cap the length at 800 chars.
        summary=str(row.get("summary") or row.get("description") or "").strip()[:800],
        tags=tags,
        language=language,
    )


def search_medical_news(
    specialties: list[str],
    limit: int = 30,
) -> list[NewsCandidate]:
    """Ask an OpenAI search-capable chat model for current medical news.

    Configuration is read from the environment:

    * ``OPENAI_API_KEY`` (required)
    * ``NEWS_SEARCH_MODEL`` (optional; defaults to
      ``gpt-4o-mini-search-preview`` when unset or blank)

    Args:
        specialties: Specialty slugs used to build the prompt and as default
            tags for items that carry none.
        limit: Maximum number of results to request; when positive, the
            returned list is also truncated to this length because the model
            is not guaranteed to respect the requested maximum.

    Returns:
        The normalized news items. Empty when the reply has no parseable
        ``"news"`` array.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.

    Errors raised by the OpenAI client call itself are not caught here.
    """
    key = os.getenv("OPENAI_API_KEY", "").strip()
    if not key:
        raise RuntimeError("OPENAI_API_KEY nicht gesetzt")

    prompt = _build_news_prompt(specialties=specialties, limit=limit)
    # `or` also covers the variable being set to an empty/blank string,
    # which would otherwise be sent to the API as model="".
    search_model = (
        os.getenv("NEWS_SEARCH_MODEL", "").strip() or "gpt-4o-mini-search-preview"
    )

    # NOTE(review): timeout is presumably in seconds per the SDK — confirm.
    client = OpenAI(api_key=key, timeout=80)
    resp = client.chat.completions.create(
        model=search_model,
        messages=[
            {
                "role": "system",
                "content": (
                    "Du bist ein medizinischer Nachrichtenassistent. "
                    "Suche im Internet nach den neuesten medizinischen Nachrichten "
                    "und liefere die Ergebnisse als JSON. "
                    "Gib NUR real existierende Artikel mit funktionierenden Links an."
                ),
            },
            {"role": "user", "content": prompt},
        ],
    )

    # Best effort: a reply without choices/message/content is treated as empty
    # text rather than an error.
    try:
        txt = (resp.choices[0].message.content or "").strip()
    except (AttributeError, IndexError, TypeError):
        txt = ""

    payload = _extract_json_block(txt)
    rows = payload.get("news")
    if not isinstance(rows, list):
        rows = []

    normalized = (_normalize_news(row, default_tags=specialties) for row in rows)
    out = [cand for cand in normalized if cand is not None]

    # Enforce the caller's limit; non-positive limits leave the list untouched.
    return out[:limit] if limit > 0 else out