update
This commit is contained in:
220
AzA march 2026 - Kopie (18)/services/event_extract_llm.py
Normal file
220
AzA march 2026 - Kopie (18)/services/event_extract_llm.py
Normal file
@@ -0,0 +1,220 @@
|
||||
from __future__ import annotations

import datetime
import json
import os
import re
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Any

from openai import OpenAI

from services.live_event_search import SearchResult
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EventCandidate:
    """One medical-event candidate extracted by the LLM from live search results."""

    name: str  # event title; rows without a name are dropped in _normalize_candidate
    startDate: str | None  # ISO YYYY-MM-DD, or None when the LLM could not determine it
    endDate: str | None  # ISO YYYY-MM-DD; falls back to startDate during normalization
    city: str
    country: str
    urlCandidate: str  # source URL; rows without one are dropped in _normalize_candidate
    shortDescription: str  # truncated to 600 chars during normalization
    organizer: str
    specialtyTags: list[str]  # lower-cased during normalization
    regionTags: list[str]  # upper-cased during normalization
    rationale: str  # LLM justification, truncated to 300 chars
    confidence: float  # clamped to [0.0, 1.0] during normalization
|
||||
|
||||
|
||||
def _clean_text(t: str) -> str:
|
||||
t = re.sub(r"<[^>]+>", " ", t or "")
|
||||
t = re.sub(r"\s+", " ", t).strip()
|
||||
return t
|
||||
|
||||
|
||||
def _safe_date(value: Any) -> str | None:
|
||||
s = str(value or "").strip()
|
||||
if not s:
|
||||
return None
|
||||
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s):
|
||||
return s
|
||||
return None
|
||||
|
||||
|
||||
def _extract_page_excerpt(url: str, max_chars: int = 14000) -> str:
|
||||
u = (url or "").strip()
|
||||
if not u:
|
||||
return ""
|
||||
try:
|
||||
req = urllib.request.Request(
|
||||
u,
|
||||
headers={"User-Agent": "AZA-LiveEventSearch/1.0"},
|
||||
method="GET",
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=8) as resp:
|
||||
raw = resp.read(max_chars * 2).decode("utf-8", errors="ignore")
|
||||
return _clean_text(raw)[:max_chars]
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def _make_prompt(
|
||||
search_rows: list[SearchResult],
|
||||
specialty: str,
|
||||
regions: list[str],
|
||||
excerpts: list[dict[str, str]],
|
||||
) -> str:
|
||||
search_payload = [
|
||||
{"title": r.title, "snippet": r.snippet, "url": r.url}
|
||||
for r in search_rows
|
||||
]
|
||||
rules = (
|
||||
"Du extrahierst medizinische Kongresse/Weiterbildungen als STRICT JSON.\n"
|
||||
"NUR aus den gelieferten Daten, NICHT raten, NICHT erfinden.\n"
|
||||
"Wenn Datum/Ort unklar: null setzen und confidence reduzieren.\n"
|
||||
"urlCandidate MUSS eine gelieferte URL oder eine URL aus pageExcerpts sein.\n"
|
||||
"Antwortformat: JSON-Objekt mit key 'events' (Array).\n"
|
||||
"Jedes Event: {name,startDate,endDate,city,country,urlCandidate,shortDescription,organizer,specialtyTags,regionTags,rationale,confidence}\n"
|
||||
"startDate/endDate als ISO YYYY-MM-DD oder null.\n"
|
||||
"confidence zwischen 0 und 1."
|
||||
)
|
||||
payload = {
|
||||
"specialty": specialty,
|
||||
"regions": regions,
|
||||
"searchResults": search_payload,
|
||||
"pageExcerpts": excerpts,
|
||||
}
|
||||
return rules + "\n\nDATA:\n" + json.dumps(payload, ensure_ascii=False)
|
||||
|
||||
|
||||
def _extract_json_block(text: str) -> dict:
|
||||
try:
|
||||
data = json.loads(text)
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
except Exception:
|
||||
pass
|
||||
match = re.search(r"\{.*\}", text, flags=re.DOTALL)
|
||||
if not match:
|
||||
return {"events": []}
|
||||
try:
|
||||
data = json.loads(match.group(0))
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
except Exception:
|
||||
pass
|
||||
return {"events": []}
|
||||
|
||||
|
||||
def _call_openai_json(prompt: str) -> dict:
|
||||
key = os.getenv("OPENAI_API_KEY", "").strip()
|
||||
if not key:
|
||||
raise RuntimeError("OPENAI_API_KEY nicht gesetzt")
|
||||
model = os.getenv("EVENT_LLM_MODEL", "gpt-4o-mini").strip()
|
||||
client = OpenAI(api_key=key)
|
||||
resp = client.chat.completions.create(
|
||||
model=model,
|
||||
temperature=0,
|
||||
messages=[
|
||||
{"role": "system", "content": "Gib nur valides JSON zurück."},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
)
|
||||
txt = ""
|
||||
try:
|
||||
txt = (resp.choices[0].message.content or "").strip()
|
||||
except Exception:
|
||||
txt = ""
|
||||
return _extract_json_block(txt)
|
||||
|
||||
|
||||
def _call_gemini_json(prompt: str) -> dict:
|
||||
key = os.getenv("GEMINI_API_KEY", "").strip()
|
||||
if not key:
|
||||
raise RuntimeError("GEMINI_API_KEY nicht gesetzt")
|
||||
model = os.getenv("GEMINI_MODEL", "gemini-1.5-flash").strip()
|
||||
url = (
|
||||
f"https://generativelanguage.googleapis.com/v1beta/models/"
|
||||
f"{urllib.parse.quote(model)}:generateContent?key={urllib.parse.quote(key)}"
|
||||
)
|
||||
body = {
|
||||
"contents": [{"parts": [{"text": prompt}]}],
|
||||
"generationConfig": {"temperature": 0},
|
||||
}
|
||||
req = urllib.request.Request(
|
||||
url,
|
||||
headers={"Content-Type": "application/json"},
|
||||
data=json.dumps(body).encode("utf-8"),
|
||||
method="POST",
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=20) as resp:
|
||||
raw = resp.read().decode("utf-8", errors="ignore")
|
||||
data = json.loads(raw)
|
||||
txt = ""
|
||||
try:
|
||||
txt = data["candidates"][0]["content"]["parts"][0]["text"]
|
||||
except Exception:
|
||||
txt = ""
|
||||
return _extract_json_block(txt)
|
||||
|
||||
|
||||
def _normalize_candidate(row: dict, default_specialty: str, default_regions: list[str]) -> EventCandidate | None:
    """Validate and coerce one raw LLM row into an EventCandidate.

    Returns None when *row* is not a dict or lacks a name/urlCandidate.
    Confidence is clamped to [0, 1]; non-list tag fields fall back to the
    provided defaults; endDate falls back to startDate.
    """
    if not isinstance(row, dict):
        return None
    name = str(row.get("name") or "").strip()
    url_candidate = str(row.get("urlCandidate") or "").strip()
    if not (name and url_candidate):
        return None

    try:
        raw_confidence = float(row.get("confidence", 0.0))
    except Exception:
        raw_confidence = 0.0
    confidence = min(1.0, max(0.0, raw_confidence))

    raw_specialties = row.get("specialtyTags")
    if not isinstance(raw_specialties, list):
        raw_specialties = [default_specialty]
    raw_regions = row.get("regionTags")
    if not isinstance(raw_regions, list):
        raw_regions = list(default_regions)

    start = _safe_date(row.get("startDate"))
    return EventCandidate(
        name=name,
        startDate=start,
        endDate=_safe_date(row.get("endDate")) or start,
        city=str(row.get("city") or "").strip(),
        country=str(row.get("country") or "").strip(),
        urlCandidate=url_candidate,
        shortDescription=str(row.get("shortDescription") or "").strip()[:600],
        organizer=str(row.get("organizer") or "").strip(),
        specialtyTags=[str(tag).strip().lower() for tag in raw_specialties if str(tag).strip()],
        regionTags=[str(tag).strip().upper() for tag in raw_regions if str(tag).strip()],
        rationale=str(row.get("rationale") or "").strip()[:300],
        confidence=confidence,
    )
|
||||
|
||||
|
||||
def extract_event_candidates(
    search_rows: list[SearchResult],
    specialty: str,
    regions: list[str],
) -> list[EventCandidate]:
    """Run the full LLM extraction pipeline over up to 10 search results.

    Fetches a page excerpt for each result (best-effort), builds the prompt,
    dispatches to the provider selected by LLM_PROVIDER ("gemini" or the
    OpenAI default), and returns the normalized, validated candidates.
    """
    if not search_rows:
        return []
    selected = list(search_rows[:10])
    excerpts = [
        {"url": row.url, "excerpt": excerpt}
        for row in selected
        if (excerpt := _extract_page_excerpt(row.url))
    ]
    prompt = _make_prompt(selected, specialty=specialty, regions=regions, excerpts=excerpts)

    provider = os.getenv("LLM_PROVIDER", "openai").strip().lower()
    call = _call_gemini_json if provider == "gemini" else _call_openai_json
    payload = call(prompt)

    raw_events = payload.get("events")
    if not isinstance(raw_events, list):
        raw_events = []
    candidates: list[EventCandidate] = []
    for raw in raw_events:
        normalized = _normalize_candidate(raw, default_specialty=specialty, default_regions=regions)
        if normalized is not None:
            candidates.append(normalized)
    return candidates
|
||||
|
||||
Reference in New Issue
Block a user