update

2026-03-25 22:03:39 +01:00
parent a0073b4fb1
commit faf4ca10c9
5603 changed files with 1030866 additions and 79 deletions
--- a/Kopie/services/news_llm_search.py
+++ b/Kopie/services/news_llm_search.py
@@ -0,0 +1,171 @@
+"""Live medical news search using OpenAI web search (like ChatGPT)."""
+from __future__ import annotations
+
+import json
+import os
+import re
+from dataclasses import dataclass, asdict
+from datetime import date
+from typing import Any
+
+from openai import OpenAI
+
# Maps specialty slugs (the values accepted in the ``specialties`` argument)
# to the German display labels that are inserted into the search prompt.
# Slugs that are missing here are used verbatim by ``_build_news_prompt``.
SPECIALTY_LABELS: dict[str, str] = {
    "dermatology": "Dermatologie",
    "general-medicine": "Allgemeinmedizin",
    "internal-medicine": "Innere Medizin",
    "gynecology": "Gynäkologie",
    "anesthesiology": "Anästhesiologie",
    "cardiology": "Kardiologie",
    "oncology": "Onkologie",
    "pediatrics": "Pädiatrie",
    "neurology": "Neurologie",
    "psychiatry": "Psychiatrie",
    "surgery": "Chirurgie",
    "ophthalmology": "Ophthalmologie",
    "ent": "HNO",
    "urology": "Urologie",
    "orthopedics": "Orthopädie",
    "radiology": "Radiologie",
    "rheumatology": "Rheumatologie",
    "endocrinology": "Endokrinologie",
    "gastroenterology": "Gastroenterologie",
    "pulmonology": "Pneumologie",
    "nephrology": "Nephrologie",
    "infectiology": "Infektiologie",
    "emergency-medicine": "Notfallmedizin",
    "pathology": "Pathologie",
    "allergology": "Allergologie",
}
+
+
@dataclass(frozen=True)
class NewsCandidate:
    """One normalized news item produced by the live search.

    Instances are frozen, compare by value, and are hashable so they can be
    de-duplicated with a ``set`` or used as dictionary keys.  All fields are
    plain strings as delivered by the model; ``publishedAt`` is requested in
    ``YYYY-MM-DD`` form by the prompt but is not validated here.
    """

    title: str  # headline of the article
    url: str  # link to the article; may be empty
    source: str  # name of the publishing outlet
    publishedAt: str  # publication date string (camelCase kept for consumers)
    summary: str  # short abstract of the article
    tags: list[str]  # specialty slugs attached to the item
    language: str  # language code of the original article

    def __hash__(self) -> int:
        # The hash that ``dataclass`` would generate for a frozen class hashes
        # the raw field values and therefore raises TypeError on the ``tags``
        # list.  Hash an immutable snapshot instead; this stays consistent
        # with the generated ``__eq__`` because equal lists give equal tuples.
        return hash(
            (
                self.title,
                self.url,
                self.source,
                self.publishedAt,
                self.summary,
                tuple(self.tags),
                self.language,
            )
        )
+
+
def _extract_json_block(text: str) -> dict:
    """Pull the JSON payload out of a model reply.

    The reply may be bare JSON, JSON wrapped in a Markdown code fence, or
    JSON embedded in surrounding prose.  Parsing is best-effort and never
    raises for malformed input.

    Args:
        text: Raw reply text; ``None`` or an empty string is treated as
            "no payload".

    Returns:
        The decoded JSON object.  A bare top-level array is wrapped as
        ``{"news": [...]}``.  When the reply contains several objects, the
        first one that has a ``"news"`` key wins, otherwise the first object
        found.  ``{"news": []}`` is returned when nothing can be decoded.
    """
    cleaned = (text or "").strip()
    # Strip a leading ```lang fence and a trailing ``` fence, if present.
    cleaned = re.sub(r"^```[a-zA-Z]*\s*", "", cleaned)
    cleaned = re.sub(r"\s*```\s*$", "", cleaned)
    cleaned = cleaned.strip()

    # Fast path: the whole reply is a single JSON document.
    try:
        data = json.loads(cleaned)
    except (ValueError, RecursionError):
        data = None
    if isinstance(data, dict):
        return data
    if isinstance(data, list):
        return {"news": data}

    # Slow path: decode an object starting at each "{" in turn.  Unlike a
    # greedy first-"{"-to-last-"}" match, this survives trailing prose that
    # itself contains braces and replies that hold more than one object.
    decoder = json.JSONDecoder()
    fallback = None
    pos = cleaned.find("{")
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(cleaned, pos)
        except (ValueError, RecursionError):
            pos = cleaned.find("{", pos + 1)
            continue
        if "news" in obj:
            return obj
        if fallback is None:
            fallback = obj
        # Continue after the decoded object so nested fragments are skipped.
        pos = cleaned.find("{", end)

    return fallback if fallback is not None else {"news": []}
+
+
def _build_news_prompt(
    specialties: list[str],
    limit: int,
) -> str:
    """Build the German user prompt for the news web search.

    Args:
        specialties: Specialty slugs; known slugs are replaced by their
            German label from ``SPECIALTY_LABELS``, unknown ones are used
            verbatim.  An empty list selects general medicine.
        limit: Maximum number of results to request.  Values below 1 are
            treated as 1.

    Returns:
        The prompt text.  It embeds today's date and asks for a pure-JSON
        answer of the form ``{"news": [...]}``.
    """
    spec_labels = [SPECIALTY_LABELS.get(s, s) for s in specialties]
    spec_text = ", ".join(spec_labels) if spec_labels else "Medizin allgemein"
    today_str = date.today().isoformat()

    # Keep the requested range satisfiable: never ask for "at least 10" when
    # the caller allows fewer than 10 results, and never for "at most 0".
    max_results = max(1, limit)
    min_results = min(10, max_results)

    return (
        "Suche im Internet nach den NEUESTEN und WICHTIGSTEN medizinischen News "
        f"und Forschungsergebnissen. Heutiges Datum: {today_str}\n\n"
        f"Fachgebiete: {spec_text}\n\n"
        "Ich brauche aktuelle, relevante Nachrichten aus der Medizin:\n"
        "- Neue Studien und Forschungsergebnisse\n"
        "- Neue Therapien und Medikamente (Zulassungen, Phase-III-Ergebnisse)\n"
        "- Leitlinien-Updates\n"
        "- Wichtige Konferenz-Highlights und Abstracts\n"
        "- Gesundheitspolitische Nachrichten\n"
        "- Sicherheitswarnungen (FDA, EMA, Swissmedic)\n\n"
        "Bevorzuge Quellen wie: NEJM, Lancet, JAMA, BMJ, Nature Medicine, "
        "Deutsches Ärzteblatt, Swiss Medical Weekly, Medical Tribune, "
        "PubMed, Medscape, aerzteblatt.de\n\n"
        f"Liefere mindestens {min_results}, maximal {max_results} Ergebnisse.\n\n"
        "WICHTIG: Antwort als REINES JSON, kein anderer Text.\n"
        '{"news": [...]}\n'
        "Felder pro News-Item:\n"
        "title (Titel der Nachricht),\n"
        "url (DIREKTER Link zum Artikel),\n"
        "source (Name der Quelle, z.B. 'NEJM', 'Lancet'),\n"
        "publishedAt (YYYY-MM-DD, Veröffentlichungsdatum),\n"
        "summary (2-4 Sätze Zusammenfassung),\n"
        'tags (Array der Fachgebiete, z.B. ["dermatology", "oncology"]),\n'
        "language (Sprache des Originalartikels, z.B. 'en', 'de')"
    )
+
+
def _normalize_news(row: dict, default_tags: list[str]) -> NewsCandidate | None:
    """Convert one raw item from the model's JSON into a ``NewsCandidate``.

    Args:
        row: Raw item as decoded from JSON.  Anything that is not a dict is
            rejected.
        default_tags: Tags to use when the item carries no usable tags.

    Returns:
        The normalized candidate, or ``None`` when ``row`` is not a dict or
        has no non-blank title.  String fields are stripped, the summary
        (taken from ``summary`` or, failing that, ``description``) is cut to
        800 characters, tags and language are lower-cased, and a missing
        language defaults to ``"en"``.
    """
    if not isinstance(row, dict):
        return None
    title = str(row.get("title") or "").strip()
    if not title:
        return None

    def _clean_tags(values: list) -> list[str]:
        # Skip JSON nulls explicitly: str(None) would otherwise survive as
        # the bogus tag "none".
        stripped = (str(v).strip().lower() for v in values if v is not None)
        return [tag for tag in stripped if tag]

    raw_tags = row.get("tags")
    tags = _clean_tags(raw_tags) if isinstance(raw_tags, list) else []
    if not tags:
        # Missing, malformed, or effectively empty tags: fall back to the
        # specialties the search was run for.
        tags = _clean_tags(list(default_tags))

    return NewsCandidate(
        title=title,
        url=str(row.get("url") or "").strip(),
        source=str(row.get("source") or "").strip(),
        publishedAt=str(row.get("publishedAt") or "").strip(),
        summary=str(row.get("summary") or row.get("description") or "").strip()[:800],
        tags=tags,
        language=str(row.get("language") or "en").strip().lower(),
    )
+
+
def search_medical_news(
    specialties: list[str],
    limit: int = 30,
) -> list[NewsCandidate]:
    """Run a live web search for medical news and return normalized items.

    The API key is read from ``OPENAI_API_KEY``.  The model name is read from
    ``NEWS_SEARCH_MODEL`` and defaults to ``gpt-4o-mini-search-preview`` when
    the variable is unset or blank.

    Args:
        specialties: Specialty slugs to search for; also used as default tags
            for items that carry none.
        limit: Maximum number of items to return.  A value below 1 yields an
            empty list without contacting the API.

    Returns:
        At most ``limit`` candidates, in the order the model listed them.
        Items that cannot be normalized are skipped; an unparsable reply
        yields an empty list.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
        Exceptions raised by the OpenAI client call are not caught here.
    """
    key = os.getenv("OPENAI_API_KEY", "").strip()
    if not key:
        raise RuntimeError("OPENAI_API_KEY nicht gesetzt")
    if limit <= 0:
        # Nothing may be returned, so skip the (billable) request entirely.
        return []

    prompt = _build_news_prompt(specialties=specialties, limit=limit)
    # ``or`` covers a variable that is set but blank, which would otherwise
    # send an empty model name to the API.
    search_model = os.getenv("NEWS_SEARCH_MODEL", "").strip() or "gpt-4o-mini-search-preview"

    client = OpenAI(api_key=key, timeout=80)
    resp = client.chat.completions.create(
        model=search_model,
        messages=[
            {
                "role": "system",
                "content": (
                    "Du bist ein medizinischer Nachrichtenassistent. "
                    "Suche im Internet nach den neuesten medizinischen Nachrichten "
                    "und liefere die Ergebnisse als JSON. "
                    "Gib NUR real existierende Artikel mit funktionierenden Links an."
                ),
            },
            {"role": "user", "content": prompt},
        ],
    )

    # A reply without choices or without message content counts as empty.
    try:
        txt = (resp.choices[0].message.content or "").strip()
    except (AttributeError, IndexError, TypeError):
        txt = ""

    payload = _extract_json_block(txt)
    rows = payload.get("news")
    if not isinstance(rows, list):
        rows = []

    out: list[NewsCandidate] = []
    for row in rows:
        cand = _normalize_news(row, default_tags=specialties)
        if cand is None:
            continue
        out.append(cand)
        # The prompt only *asks* for at most ``limit`` items; enforce it.
        if len(out) >= limit:
            break
    return out