Files
aza/AzA march 2026/services/news_llm_search.py
2026-03-25 22:03:39 +01:00

172 lines
5.7 KiB
Python

"""Live medical news search using OpenAI web search (like ChatGPT)."""
from __future__ import annotations
import json
import os
import re
from dataclasses import dataclass, asdict
from datetime import date
from typing import Any
from openai import OpenAI
# Lookup table from specialty slug to the German display name that is
# interpolated into the search prompt. Slugs without an entry are passed
# through unchanged by the prompt builder.
SPECIALTY_LABELS: dict[str, str] = {
    "dermatology": "Dermatologie",
    "general-medicine": "Allgemeinmedizin",
    "internal-medicine": "Innere Medizin",
    "gynecology": "Gynäkologie",
    "anesthesiology": "Anästhesiologie",
    "cardiology": "Kardiologie",
    "oncology": "Onkologie",
    "pediatrics": "Pädiatrie",
    "neurology": "Neurologie",
    "psychiatry": "Psychiatrie",
    "surgery": "Chirurgie",
    "ophthalmology": "Ophthalmologie",
    "ent": "HNO",
    "urology": "Urologie",
    "orthopedics": "Orthopädie",
    "radiology": "Radiologie",
    "rheumatology": "Rheumatologie",
    "endocrinology": "Endokrinologie",
    "gastroenterology": "Gastroenterologie",
    "pulmonology": "Pneumologie",
    "nephrology": "Nephrologie",
    "infectiology": "Infektiologie",
    "emergency-medicine": "Notfallmedizin",
    "pathology": "Pathologie",
    "allergology": "Allergologie",
}
@dataclass(frozen=True)
class NewsCandidate:
    """Immutable record for a single news item extracted from the model reply.

    The attribute names match the JSON keys requested in the search prompt,
    which is why ``publishedAt`` is camelCase rather than snake_case.
    """

    # Headline of the article.
    title: str
    # Link to the article; may be an empty string.
    url: str
    # Name of the publishing source.
    source: str
    # Publication date as delivered by the model (requested as YYYY-MM-DD,
    # but stored as an unvalidated string).
    publishedAt: str
    # Short summary text.
    summary: str
    # Specialty tags attached to the item.
    tags: list[str]
    # Language code of the original article.
    language: str
def _extract_json_block(text: str) -> dict:
cleaned = text.strip()
cleaned = re.sub(r"^```[a-zA-Z]*\s*", "", cleaned)
cleaned = re.sub(r"\s*```\s*$", "", cleaned)
cleaned = cleaned.strip()
try:
data = json.loads(cleaned)
if isinstance(data, dict):
return data
except Exception:
pass
match = re.search(r"\{[\s\S]*\}", cleaned)
if match:
try:
data = json.loads(match.group(0))
if isinstance(data, dict):
return data
except Exception:
pass
return {"news": []}
def _build_news_prompt(
specialties: list[str],
limit: int,
) -> str:
spec_labels = [SPECIALTY_LABELS.get(s, s) for s in specialties]
spec_text = ", ".join(spec_labels) if spec_labels else "Medizin allgemein"
today_str = date.today().isoformat()
return (
f"Suche im Internet nach den NEUESTEN und WICHTIGSTEN medizinischen News "
f"und Forschungsergebnissen. Heutiges Datum: {today_str}\n\n"
f"Fachgebiete: {spec_text}\n\n"
"Ich brauche aktuelle, relevante Nachrichten aus der Medizin:\n"
"- Neue Studien und Forschungsergebnisse\n"
"- Neue Therapien und Medikamente (Zulassungen, Phase-III-Ergebnisse)\n"
"- Leitlinien-Updates\n"
"- Wichtige Konferenz-Highlights und Abstracts\n"
"- Gesundheitspolitische Nachrichten\n"
"- Sicherheitswarnungen (FDA, EMA, Swissmedic)\n\n"
"Bevorzuge Quellen wie: NEJM, Lancet, JAMA, BMJ, Nature Medicine, "
"Deutsches Ärzteblatt, Swiss Medical Weekly, Medical Tribune, "
"PubMed, Medscape, aerzteblatt.de\n\n"
f"Liefere mindestens 10, maximal {limit} Ergebnisse.\n\n"
"WICHTIG: Antwort als REINES JSON, kein anderer Text.\n"
'{"news": [...]}\n'
"Felder pro News-Item:\n"
"title (Titel der Nachricht),\n"
"url (DIREKTER Link zum Artikel),\n"
"source (Name der Quelle, z.B. 'NEJM', 'Lancet'),\n"
"publishedAt (YYYY-MM-DD, Veröffentlichungsdatum),\n"
"summary (2-4 Sätze Zusammenfassung),\n"
'tags (Array der Fachgebiete, z.B. ["dermatology", "oncology"]),\n'
"language (Sprache des Originalartikels, z.B. 'en', 'de')"
)
def _normalize_news(row: dict, default_tags: list[str]) -> NewsCandidate | None:
    """Convert one raw item of the model reply into a ``NewsCandidate``.

    Args:
        row: A single entry of the ``news`` array. Anything that is not a
            dict is rejected.
        default_tags: Tags to fall back to when ``row`` has no ``tags`` list.

    Returns:
        The cleaned candidate, or ``None`` when ``row`` is not a dict or has
        no non-empty title. All text fields are stripped; the summary is
        capped at 800 characters (``description`` is used when ``summary``
        is missing); tags and language are lower-cased.
    """
    if not isinstance(row, dict):
        return None

    title = str(row.get("title") or "").strip()
    if not title:
        return None

    raw_tags = row.get("tags")
    if not isinstance(raw_tags, list):
        raw_tags = list(default_tags)
    # Skip null entries explicitly: str(None) would otherwise slip through
    # the emptiness filter and end up as the bogus tag "none".
    normalized_tags = (
        str(item).strip().lower() for item in raw_tags if item is not None
    )
    tags = [tag for tag in normalized_tags if tag]

    # A whitespace-only language is truthy, so the "en" default has to be
    # applied after stripping, not before.
    language = str(row.get("language") or "").strip().lower() or "en"

    summary_text = str(row.get("summary") or row.get("description") or "")

    return NewsCandidate(
        title=title,
        url=str(row.get("url") or "").strip(),
        source=str(row.get("source") or "").strip(),
        publishedAt=str(row.get("publishedAt") or "").strip(),
        summary=summary_text.strip()[:800],
        tags=tags,
        language=language,
    )
def search_medical_news(
    specialties: list[str],
    limit: int = 30,
) -> list[NewsCandidate]:
    """Ask an OpenAI search-enabled chat model for current medical news.

    The model name is read from the ``NEWS_SEARCH_MODEL`` environment
    variable and defaults to ``gpt-4o-mini-search-preview`` when the variable
    is unset or blank.

    Args:
        specialties: Specialty slugs to focus the search on; also used as
            default tags for items that come back without tags.
        limit: Maximum number of items to request and to return.

    Returns:
        The normalized news items, at most ``limit`` of them. Items without
        a title are dropped. An unparseable reply yields an empty list.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
        Exception: Errors raised by the OpenAI client during the request are
            not caught here and propagate to the caller.
    """
    key = os.getenv("OPENAI_API_KEY", "").strip()
    if not key:
        raise RuntimeError("OPENAI_API_KEY nicht gesetzt")

    # An env var that is set but blank must not produce an empty model name.
    search_model = (
        os.getenv("NEWS_SEARCH_MODEL", "").strip() or "gpt-4o-mini-search-preview"
    )
    prompt = _build_news_prompt(specialties=specialties, limit=limit)

    client = OpenAI(api_key=key, timeout=80)
    resp = client.chat.completions.create(
        model=search_model,
        messages=[
            {
                "role": "system",
                "content": (
                    "Du bist ein medizinischer Nachrichtenassistent. "
                    "Suche im Internet nach den neuesten medizinischen Nachrichten "
                    "und liefere die Ergebnisse als JSON. "
                    "Gib NUR real existierende Artikel mit funktionierenden Links an."
                ),
            },
            {"role": "user", "content": prompt},
        ],
    )

    # Best effort: a reply without choices or content is treated as empty
    # rather than as an error.
    try:
        txt = (resp.choices[0].message.content or "").strip()
    except (AttributeError, IndexError, TypeError):
        txt = ""

    payload = _extract_json_block(txt)
    rows = payload.get("news")
    if not isinstance(rows, list):
        rows = []

    candidates = (_normalize_news(row, default_tags=specialties) for row in rows)
    out = [cand for cand in candidates if cand is not None]

    # The prompt only *asks* for at most `limit` items; enforce it here in
    # case the model returns more.
    return out[: max(limit, 0)]